diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index 087f11b5..b7575c4a 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -106,8 +106,18 @@ def main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided, if command.help or command.subcommand is None: command.subcommand = 'help' - if command.version: + elif command.version: command.subcommand = 'version' + + if command.subcommand not in ('help', 'version', 'status'): + from ..cli.logging import log_cli_command + + log_cli_command( + subcommand=command.subcommand, + subcommand_args=command.subcommand_args, + stdin=stdin, + pwd=pwd or OUTPUT_DIR + ) run_subcommand( subcommand=command.subcommand, diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index c4c78399..55832346 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -10,7 +10,7 @@ from typing import List, Optional, IO from ..main import add, docstring from ..config import OUTPUT_DIR, ONLY_NEW -from .logging import SmartFormatter, accept_stdin +from .logging import SmartFormatter, accept_stdin, stderr @docstring(add.__doc__) @@ -33,12 +33,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional help="Add the links to the main index without archiving them", ) parser.add_argument( - 'import_path', - nargs='?', + 'urls', + nargs='*', type=str, default=None, help=( - 'URL or path to local file to start the archiving process from. e.g.:\n' + 'URLs or paths to archive e.g.:\n' ' https://getpocket.com/users/USERNAME/feed/all\n' ' https://example.com/some/rss/feed.xml\n' ' https://example.com\n' @@ -50,25 +50,21 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional "--depth", action="store", default=0, - choices=[0,1], + choices=[0, 1], type=int, help="Recursively archive all linked pages up to this many hops away" ) command = parser.parse_args(args or ()) - import_string = accept_stdin(stdin) - if import_string and command.import_path: + urls = command.urls + stdin_urls = accept_stdin(stdin) + if (stdin_urls and urls) or (not stdin and not urls): stderr( - '[X] You should pass an import path or a page url as an argument or in stdin but not both\n', + '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n', color='red', ) raise SystemExit(2) - elif import_string: - import_path = import_string - else: - import_path = command.import_path - add( - url=import_path, + urls=stdin_urls or urls, depth=command.depth, update_all=command.update_all, index_only=command.index_only, diff --git a/archivebox/cli/logging.py b/archivebox/cli/logging.py index 6de78d8f..a12c4e98 100644 --- a/archivebox/cli/logging.py +++ b/archivebox/cli/logging.py @@ -5,10 +5,12 @@ import os import sys import time import argparse +import logging +import signal +from multiprocessing import Process from datetime import datetime from dataclasses import dataclass -from multiprocessing import Process from typing import Optional, List, Dict, Union, IO from ..index.schema import Link, ArchiveResult @@ -23,11 +25,11 @@ from ..config import ( SHOW_PROGRESS, TERM_WIDTH, OUTPUT_DIR, + SOURCES_DIR_NAME, HTML_INDEX_FILENAME, stderr, ) - @dataclass class RuntimeStats: """mutable stats counter for logging archiving timing info to CLI output""" @@ -98,9 +100,9 @@ class TimedProgress: if SHOW_PROGRESS: # terminate if we havent already terminated - if self.p is not None: - self.p.terminate() - self.p = None + self.p.terminate() + self.p.join() + 
self.p.close() # clear whole terminal line try: @@ -145,28 +147,51 @@ def progress_bar(seconds: int, prefix: str='') -> None: seconds, )) sys.stdout.flush() - except KeyboardInterrupt: + except (KeyboardInterrupt, BrokenPipeError): print() pass +def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str], pwd: str): + from ..config import VERSION, ANSI + cmd = ' '.join(('archivebox', subcommand, *subcommand_args)) + stdin_hint = ' < /dev/stdin' if not stdin.isatty() else '' + print('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{stdin_hint}{reset}'.format( + now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'), + VERSION=VERSION, + cmd=cmd, + stdin_hint=stdin_hint, + **ANSI, + )) + print('{black} > {pwd}{reset}'.format(pwd=pwd, **ANSI)) + print() + ### Parsing Stage -def log_parsing_started(source_file: str): - start_ts = datetime.now() - _LAST_RUN_STATS.parse_start_ts = start_ts - print('\n{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format( - start_ts.strftime('%Y-%m-%d %H:%M:%S'), - source_file.rsplit('/', 1)[-1], + +def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: bool): + _LAST_RUN_STATS.parse_start_ts = datetime.now() + print('{green}[+] [{}] Adding {} links to index (crawl depth={}){}...{reset}'.format( + _LAST_RUN_STATS.parse_start_ts.strftime('%Y-%m-%d %H:%M:%S'), + len(urls) if isinstance(urls, list) else len(urls.split('\n')), + depth, + ' (index only)' if index_only else '', **ANSI, )) +def log_source_saved(source_file: str): + print(' > Saved verbatim input to {}/{}'.format(SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1])) -def log_parsing_finished(num_parsed: int, num_new_links: int, parser_name: str): - end_ts = datetime.now() - _LAST_RUN_STATS.parse_end_ts = end_ts - print(' > Parsed {} links as {} ({} new links added)'.format(num_parsed, parser_name, num_new_links)) +def log_parsing_finished(num_parsed: int, parser_name: str): + _LAST_RUN_STATS.parse_end_ts = datetime.now() + print(' > Parsed {} URLs from input ({})'.format(num_parsed, parser_name)) +def log_deduping_finished(num_new_links: int): + print(' > Found {} new URLs not already in index'.format(num_new_links)) + + +def log_crawl_started(new_links): + print('{lightblue}[*] Starting crawl of {} sites 1 hop out from starting point{reset}'.format(len(new_links), **ANSI)) ### Indexing Stage @@ -174,7 +199,7 @@ def log_indexing_process_started(num_links: int): start_ts = datetime.now() _LAST_RUN_STATS.index_start_ts = start_ts print() - print('{green}[*] [{}] Writing {} links to main index...{reset}'.format( + print('{black}[*] [{}] Writing {} links to main index...{reset}'.format( start_ts.strftime('%Y-%m-%d %H:%M:%S'), num_links, **ANSI, @@ -209,7 +234,7 @@ def log_archiving_started(num_links: int, resume: Optional[float]=None): **ANSI, )) else: - print('{green}[▶] [{}] Updating content for {} matching pages in archive...{reset}'.format( + print('{green}[▶] [{}] Collecting content for {} Snapshots in archive...{reset}'.format( start_ts.strftime('%Y-%m-%d %H:%M:%S'), num_links, **ANSI, diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index c6a4f33c..c08e7c0c 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -2,7 +2,7 @@ __package__ = 'archivebox.extractors' import os -from typing import Optional +from typing import Optional, List from datetime import datetime from ..index.schema import Link @@ -13,6 +13,9 @@ from ..index import ( ) from ..util import enforce_types from 
..cli.logging import ( + log_archiving_started, + log_archiving_paused, + log_archiving_finished, log_link_archiving_started, log_link_archiving_finished, log_archive_method_started, @@ -103,3 +106,25 @@ def archive_link(link: Link, overwrite: bool=False, out_dir: Optional[str]=None) raise return link + + +@enforce_types +def archive_links(links: List[Link], out_dir: Optional[str]=None) -> List[Link]: + if not links: + return [] + + log_archiving_started(len(links)) + idx: int = 0 + link: Link = links[0] + try: + for idx, link in enumerate(links): + archive_link(link, out_dir=link.link_dir) + except KeyboardInterrupt: + log_archiving_paused(len(links), idx, link.timestamp) + raise SystemExit(0) + except BaseException: + print() + raise + + log_archiving_finished(len(links)) + return links diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index e82cfefa..7ea473d7 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -33,8 +33,8 @@ from ..cli.logging import ( log_indexing_process_finished, log_indexing_started, log_indexing_finished, - log_parsing_started, log_parsing_finished, + log_deduping_finished, ) from .schema import Link, ArchiveResult @@ -268,20 +268,31 @@ def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]: return None + @enforce_types -def import_new_links(existing_links: List[Link], - import_path: str, - out_dir: str=OUTPUT_DIR) -> Tuple[List[Link], List[Link]]: +def parse_links_from_source(source_path: str) -> Tuple[List[Link], List[Link]]: from ..parsers import parse_links new_links: List[Link] = [] # parse and validate the import file - log_parsing_started(import_path) - raw_links, parser_name = parse_links(import_path) + raw_links, parser_name = parse_links(source_path) new_links = validate_links(raw_links) + if parser_name: + num_parsed = len(raw_links) + log_parsing_finished(num_parsed, parser_name) + + return new_links + + +@enforce_types +def dedupe_links(existing_links: List[Link], + new_links: List[Link]) -> Tuple[List[Link], List[Link]]: + + from ..parsers import parse_links + # merge existing links in out_dir and new links all_links = validate_links(existing_links + new_links) all_link_urls = {link.url for link in existing_links} @@ -290,11 +301,7 @@ def import_new_links(existing_links: List[Link], link for link in new_links if link.url not in all_link_urls ] - - if parser_name: - num_parsed = len(raw_links) - num_new_links = len(all_links) - len(existing_links) - log_parsing_finished(num_parsed, num_new_links, parser_name) + log_deduping_finished(len(new_links)) return all_links, new_links diff --git a/archivebox/main.py b/archivebox/main.py index a6e04dd3..54b71acc 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -4,8 +4,7 @@ import os import sys import shutil -from typing import Dict, List, Optional, Iterable, IO - +from typing import Dict, List, Optional, Iterable, IO, Union from crontab import CronTab, CronSlices from .cli import ( @@ -17,16 +16,17 @@ from .cli import ( archive_cmds, ) from .parsers import ( - save_stdin_to_sources, - save_file_to_sources, + save_text_as_source, + save_file_as_source, ) from .index.schema import Link -from .util import enforce_types, docstring +from .util import enforce_types, docstring # type: ignore from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT from .index import ( links_after_timestamp, load_main_index, - import_new_links, + parse_links_from_source, + dedupe_links, write_main_index, link_matches_filter, get_indexed_folders, @@ -51,7 
+51,7 @@ from .index.sql import ( apply_migrations, ) from .index.html import parse_html_main_index -from .extractors import archive_link +from .extractors import archive_links from .config import ( stderr, ConfigDict, @@ -91,9 +91,8 @@ from .config import ( from .cli.logging import ( TERM_WIDTH, TimedProgress, - log_archiving_started, - log_archiving_paused, - log_archiving_finished, + log_importing_started, + log_crawl_started, log_removal_started, log_removal_finished, log_list_started, @@ -496,59 +495,55 @@ def status(out_dir: str=OUTPUT_DIR) -> None: @enforce_types -def add(url: str, +def add(urls: Union[str, List[str]], depth: int=0, update_all: bool=not ONLY_NEW, index_only: bool=False, out_dir: str=OUTPUT_DIR) -> List[Link]: """Add a new URL or list of URLs to your archive""" + assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)' + + # Load list of links from the existing index check_data_folder(out_dir=out_dir) - - base_path = save_stdin_to_sources(url, out_dir=out_dir) - if depth == 1: - depth_path = save_file_to_sources(url, out_dir=out_dir) check_dependencies() - - # Step 1: Load list of links from the existing index - # merge in and dedupe new links from import_path all_links: List[Link] = [] new_links: List[Link] = [] all_links = load_main_index(out_dir=out_dir) - all_links, new_links = import_new_links(all_links, base_path, out_dir=out_dir) - if depth == 1: - all_links, new_links_depth = import_new_links(all_links, depth_path, out_dir=out_dir) - new_links = new_links + new_links_depth + + log_importing_started(urls=urls, depth=depth, index_only=index_only) + if isinstance(urls, str): + # save verbatim stdin to sources + write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir) + elif isinstance(urls, list): + # save verbatim args to sources + write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir) + + new_links += parse_links_from_source(write_ahead_log) + all_links, new_links = dedupe_links(all_links, new_links) + write_main_index(links=all_links, out_dir=out_dir, finished=not new_links) - # Step 2: Write updated index with deduped old and new links back to disk - write_main_index(links=all_links, out_dir=out_dir) + # If we're going one level deeper, download each link and look for more links + if new_links and depth == 1: + log_crawl_started(new_links) + for new_link in new_links: + downloaded_file = save_file_as_source(new_link.url, filename='{ts}-crawl-{basename}.txt', out_dir=out_dir) + new_links += parse_links_from_source(downloaded_file) + all_links, new_links = dedupe_links(all_links, new_links) + write_main_index(links=all_links, out_dir=out_dir, finished=not new_links) if index_only: return all_links - - # Step 3: Run the archive methods for each link - links = all_links if update_all else new_links - log_archiving_started(len(links)) - idx: int = 0 - link: Link = None # type: ignore - try: - for idx, link in enumerate(links): - archive_link(link, out_dir=link.link_dir) - except KeyboardInterrupt: - log_archiving_paused(len(links), idx, link.timestamp if link else '0') - raise SystemExit(0) - - except: - print() - raise - - log_archiving_finished(len(links)) + # Run the archive methods for each link + to_archive = all_links if update_all else new_links + archive_links(to_archive, out_dir=out_dir) # Step 4: Re-write links index with updated titles, icons, and resources - all_links = load_main_index(out_dir=out_dir) - write_main_index(links=list(all_links), 
out_dir=out_dir, finished=True) + if to_archive: + all_links = load_main_index(out_dir=out_dir) + write_main_index(links=list(all_links), out_dir=out_dir, finished=True) return all_links @enforce_types @@ -671,23 +666,8 @@ def update(resume: Optional[float]=None, return all_links # Step 3: Run the archive methods for each link - links = new_links if only_new else all_links - log_archiving_started(len(links), resume) - idx: int = 0 - link: Link = None # type: ignore - try: - for idx, link in enumerate(links_after_timestamp(links, resume)): - archive_link(link, overwrite=overwrite, out_dir=link.link_dir) - - except KeyboardInterrupt: - log_archiving_paused(len(links), idx, link.timestamp if link else '0') - raise SystemExit(0) - - except: - print() - raise - - log_archiving_finished(len(links)) + to_archive = new_links if only_new else all_links + archive_links(to_archive, out_dir=out_dir) # Step 4: Re-write links index with updated titles, icons, and resources all_links = load_main_index(out_dir=out_dir) diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py index 479d4e2c..eabaece2 100644 --- a/archivebox/parsers/__init__.py +++ b/archivebox/parsers/__init__.py @@ -29,7 +29,7 @@ from ..util import ( URL_REGEX, ) from ..index.schema import Link -from ..cli.logging import pretty_path, TimedProgress +from ..cli.logging import pretty_path, TimedProgress, log_source_saved from .pocket_html import parse_pocket_html_export from .pinboard_rss import parse_pinboard_rss_export from .shaarli_rss import parse_shaarli_rss_export @@ -83,36 +83,22 @@ def parse_links(source_file: str) -> Tuple[List[Link], str]: @enforce_types -def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str: - check_data_folder(out_dir=out_dir) - - sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME) - if not os.path.exists(sources_dir): - os.makedirs(sources_dir) - +def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: str=OUTPUT_DIR) -> str: ts = str(datetime.now().timestamp()).split('.', 1)[0] - - source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts)) + source_path = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME, filename.format(ts=ts)) atomic_write(source_path, raw_text) + log_source_saved(source_file=source_path) return source_path @enforce_types -def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DIR) -> str: +def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{basename}.txt', out_dir: str=OUTPUT_DIR) -> str: """download a given url's content into output/sources/domain-.txt""" - check_data_folder(out_dir=out_dir) - - sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME) - if not os.path.exists(sources_dir): - os.makedirs(sources_dir) - ts = str(datetime.now().timestamp()).split('.', 1)[0] - - source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts)) + source_path = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME, filename.format(basename=basename(path), ts=ts)) if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')): # Source is a URL that needs to be downloaded - source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts)) print('{}[*] [{}] Downloading {}{}'.format( ANSI['green'], datetime.now().strftime('%Y-%m-%d %H:%M:%S'), @@ -140,7 +126,7 @@ def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DI atomic_write(source_path, raw_source_text) - print(' > {}'.format(pretty_path(source_path))) + 
log_source_saved(source_file=source_path) return source_path
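
Usage sketch (illustrative, not part of the patch): the change above replaces the single import_path argument of add() with a urls parameter accepting either a newline-separated string (stdin) or a list of URLs (CLI arguments), adds a crawl depth of 0 or 1, and moves the per-link archiving loop into the new archive_links() helper. The snippet below is a minimal, hypothetical illustration of the refactored add() signature; it assumes ArchiveBox is installed and invoked from inside an initialized data folder (OUTPUT_DIR), and the URLs shown are placeholders.

# Equivalent CLI invocations after this change (one input source at a time;
# passing URLs via both stdin and arguments exits with an error):
#   archivebox add 'https://example.com' 'https://example.com/some/rss/feed.xml' --depth=1
#   echo 'https://example.com' | archivebox add

from archivebox.main import add

# CLI-style input: a list of URLs, crawled one hop out, written to the main
# index only (no extractors run, because index_only=True returns early).
add(
    urls=['https://example.com', 'https://example.com/some/rss/feed.xml'],
    depth=1,
    index_only=True,
)

# stdin-style input: a single newline-separated string, saved verbatim to
# sources/ and then archived via the new archive_links() helper.
add(urls='https://example.com\nhttps://getpocket.com/users/USERNAME/feed/all', depth=0)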