fix depth flag and tweak logging

Nick Sweeting 2020-07-13 11:26:30 -04:00
parent 354a63ccd4
commit d3bfa98a91
7 changed files with 156 additions and 127 deletions

View file

@@ -106,8 +106,18 @@ def main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided,
     if command.help or command.subcommand is None:
         command.subcommand = 'help'
-    if command.version:
+    elif command.version:
         command.subcommand = 'version'
 
+    if command.subcommand not in ('help', 'version', 'status'):
+        from ..cli.logging import log_cli_command
+        log_cli_command(
+            subcommand=command.subcommand,
+            subcommand_args=command.subcommand_args,
+            stdin=stdin,
+            pwd=pwd or OUTPUT_DIR
+        )
+
     run_subcommand(
         subcommand=command.subcommand,
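Note (not part of the diff): the new call site is a simple gate -- every subcommand except help/version/status now gets its full command line logged via log_cli_command() before run_subcommand() dispatches it. A hedged illustration of that call, with made-up values; the real entrypoint just forwards whatever subcommand, args, stdin handle, and working directory it was given:

import sys
from archivebox.cli.logging import log_cli_command
from archivebox.config import OUTPUT_DIR

# Illustrative values only
log_cli_command(
    subcommand='add',
    subcommand_args=['--depth=1', 'https://example.com'],
    stdin=sys.stdin,
    pwd=OUTPUT_DIR,
)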

View file

@@ -10,7 +10,7 @@ from typing import List, Optional, IO
 
 from ..main import add, docstring
 from ..config import OUTPUT_DIR, ONLY_NEW
-from .logging import SmartFormatter, accept_stdin
+from .logging import SmartFormatter, accept_stdin, stderr
 
 
 @docstring(add.__doc__)
@@ -33,12 +33,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         help="Add the links to the main index without archiving them",
     )
     parser.add_argument(
-        'import_path',
-        nargs='?',
+        'urls',
+        nargs='*',
         type=str,
         default=None,
         help=(
-            'URL or path to local file to start the archiving process from. e.g.:\n'
+            'URLs or paths to archive e.g.:\n'
             '    https://getpocket.com/users/USERNAME/feed/all\n'
             '    https://example.com/some/rss/feed.xml\n'
            '    https://example.com\n'
@@ -50,25 +50,21 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         "--depth",
         action="store",
         default=0,
-        choices=[0,1],
+        choices=[0, 1],
         type=int,
         help="Recursively archive all linked pages up to this many hops away"
     )
     command = parser.parse_args(args or ())
-    import_string = accept_stdin(stdin)
-    if import_string and command.import_path:
+    urls = command.urls
+    stdin_urls = accept_stdin(stdin)
+    if (stdin_urls and urls) or (not stdin and not urls):
         stderr(
-            '[X] You should pass an import path or a page url as an argument or in stdin but not both\n',
+            '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',
             color='red',
         )
         raise SystemExit(2)
-    elif import_string:
-        import_path = import_string
-    else:
-        import_path = command.import_path
 
     add(
-        url=import_path,
+        urls=stdin_urls or urls,
         depth=command.depth,
         update_all=command.update_all,
         index_only=command.index_only,
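The core of the depth-flag fix is the switch from nargs='?' to nargs='*': the positional argument now collects any number of URLs/paths into a list instead of a single optional string, so it no longer fights with --depth. A standalone sketch of that behaviour (plain argparse, not ArchiveBox code):

import argparse

# Same shape as the add subcommand's parser after this change: a list-valued
# positional plus an int --depth restricted to 0 or 1.
parser = argparse.ArgumentParser(prog='archivebox add')
parser.add_argument('urls', nargs='*', type=str, default=None)
parser.add_argument('--depth', action='store', default=0, choices=[0, 1], type=int)

command = parser.parse_args(['--depth=1', 'https://example.com', 'https://example.org'])
print(command.urls)   # ['https://example.com', 'https://example.org']
print(command.depth)  # 1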

View file

@@ -5,10 +5,12 @@ import os
 import sys
 import time
 import argparse
+import logging
+import signal
 
-from multiprocessing import Process
 from datetime import datetime
 from dataclasses import dataclass
+from multiprocessing import Process
 from typing import Optional, List, Dict, Union, IO
 
 from ..index.schema import Link, ArchiveResult
@@ -23,11 +25,11 @@ from ..config import (
     SHOW_PROGRESS,
     TERM_WIDTH,
     OUTPUT_DIR,
+    SOURCES_DIR_NAME,
     HTML_INDEX_FILENAME,
     stderr,
 )
 
 
 @dataclass
 class RuntimeStats:
     """mutable stats counter for logging archiving timing info to CLI output"""
@@ -98,9 +100,9 @@ class TimedProgress:
         if SHOW_PROGRESS:
             # terminate if we havent already terminated
-            if self.p is not None:
-                self.p.terminate()
-            self.p = None
+            self.p.terminate()
+            self.p.join()
+            self.p.close()
 
             # clear whole terminal line
             try:
@@ -145,28 +147,51 @@ def progress_bar(seconds: int, prefix: str='') -> None:
                 seconds,
             ))
             sys.stdout.flush()
-    except KeyboardInterrupt:
+    except (KeyboardInterrupt, BrokenPipeError):
         print()
         pass
 
 
+def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str], pwd: str):
+    from ..config import VERSION, ANSI
+    cmd = ' '.join(('archivebox', subcommand, *subcommand_args))
+    stdin_hint = ' < /dev/stdin' if not stdin.isatty() else ''
+    print('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{stdin_hint}{reset}'.format(
+        now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+        VERSION=VERSION,
+        cmd=cmd,
+        stdin_hint=stdin_hint,
+        **ANSI,
+    ))
+    print('{black}    > {pwd}{reset}'.format(pwd=pwd, **ANSI))
+    print()
+
+
 ### Parsing Stage
 
-def log_parsing_started(source_file: str):
-    start_ts = datetime.now()
-    _LAST_RUN_STATS.parse_start_ts = start_ts
-    print('\n{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
-        start_ts.strftime('%Y-%m-%d %H:%M:%S'),
-        source_file.rsplit('/', 1)[-1],
+def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: bool):
+    _LAST_RUN_STATS.parse_start_ts = datetime.now()
+    print('{green}[+] [{}] Adding {} links to index (crawl depth={}){}...{reset}'.format(
+        _LAST_RUN_STATS.parse_start_ts.strftime('%Y-%m-%d %H:%M:%S'),
+        len(urls) if isinstance(urls, list) else len(urls.split('\n')),
+        depth,
+        ' (index only)' if index_only else '',
         **ANSI,
     ))
 
+def log_source_saved(source_file: str):
+    print('    > Saved verbatim input to {}/{}'.format(SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1]))
+
-def log_parsing_finished(num_parsed: int, num_new_links: int, parser_name: str):
-    end_ts = datetime.now()
-    _LAST_RUN_STATS.parse_end_ts = end_ts
-    print('    > Parsed {} links as {} ({} new links added)'.format(num_parsed, parser_name, num_new_links))
+def log_parsing_finished(num_parsed: int, parser_name: str):
+    _LAST_RUN_STATS.parse_end_ts = datetime.now()
+    print('    > Parsed {} URLs from input ({})'.format(num_parsed, parser_name))
+
+def log_deduping_finished(num_new_links: int):
+    print('    > Found {} new URLs not already in index'.format(num_new_links))
+
+def log_crawl_started(new_links):
+    print('{lightblue}[*] Starting crawl of {} sites 1 hop out from starting point{reset}'.format(len(new_links), **ANSI))
 
 
 ### Indexing Stage
@@ -174,7 +199,7 @@ def log_indexing_process_started(num_links: int):
     start_ts = datetime.now()
     _LAST_RUN_STATS.index_start_ts = start_ts
     print()
-    print('{green}[*] [{}] Writing {} links to main index...{reset}'.format(
+    print('{black}[*] [{}] Writing {} links to main index...{reset}'.format(
         start_ts.strftime('%Y-%m-%d %H:%M:%S'),
         num_links,
         **ANSI,
@@ -209,7 +234,7 @@ def log_archiving_started(num_links: int, resume: Optional[float]=None):
             **ANSI,
         ))
     else:
-        print('{green}[▶] [{}] Updating content for {} matching pages in archive...{reset}'.format(
+        print('{green}[▶] [{}] Collecting content for {} Snapshots in archive...{reset}'.format(
             start_ts.strftime('%Y-%m-%d %H:%M:%S'),
             num_links,
             **ANSI,
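One behavioural note on the TimedProgress change above: the progress subprocess is now cleaned up with the full terminate/join/close sequence instead of only terminate. A minimal standalone sketch of that pattern (assumes Python 3.7+, where multiprocessing.Process.close() exists):

import time
from multiprocessing import Process

p = Process(target=time.sleep, args=(60,))
p.start()

p.terminate()  # ask the child process to stop
p.join()       # wait until it has actually exited
p.close()      # release the Process object's resources (raises ValueError if still alive)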

View file

@@ -2,7 +2,7 @@ __package__ = 'archivebox.extractors'
 
 import os
 
-from typing import Optional
+from typing import Optional, List
 from datetime import datetime
 
 from ..index.schema import Link
@@ -13,6 +13,9 @@ from ..index import (
 )
 from ..util import enforce_types
 from ..cli.logging import (
+    log_archiving_started,
+    log_archiving_paused,
+    log_archiving_finished,
     log_link_archiving_started,
     log_link_archiving_finished,
     log_archive_method_started,
@@ -103,3 +106,25 @@ def archive_link(link: Link, overwrite: bool=False, out_dir: Optional[str]=None)
             raise
 
     return link
+
+
+@enforce_types
+def archive_links(links: List[Link], out_dir: Optional[str]=None) -> List[Link]:
+    if not links:
+        return []
+
+    log_archiving_started(len(links))
+    idx: int = 0
+    link: Link = links[0]
+    try:
+        for idx, link in enumerate(links):
+            archive_link(link, out_dir=link.link_dir)
+    except KeyboardInterrupt:
+        log_archiving_paused(len(links), idx, link.timestamp)
+        raise SystemExit(0)
+    except BaseException:
+        print()
+        raise
+
+    log_archiving_finished(len(links))
+    return links
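The new archive_links() helper centralizes the loop, KeyboardInterrupt handling, and start/paused/finished logging that add() and update() previously duplicated (see the later diff where both now call it). A hedged usage sketch, assuming an already-initialized archive data folder:

from archivebox.config import OUTPUT_DIR
from archivebox.extractors import archive_links
from archivebox.index import load_main_index

links = list(load_main_index(out_dir=OUTPUT_DIR))
archive_links(links, out_dir=OUTPUT_DIR)  # logs start/pause/finish on its own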

View file

@@ -33,8 +33,8 @@ from ..cli.logging import (
     log_indexing_process_finished,
     log_indexing_started,
     log_indexing_finished,
-    log_parsing_started,
     log_parsing_finished,
+    log_deduping_finished,
 )
 from .schema import Link, ArchiveResult
@@ -268,20 +268,31 @@ def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]:
 
     return None
 
 @enforce_types
-def import_new_links(existing_links: List[Link],
-                     import_path: str,
-                     out_dir: str=OUTPUT_DIR) -> Tuple[List[Link], List[Link]]:
+def parse_links_from_source(source_path: str) -> Tuple[List[Link], List[Link]]:
 
     from ..parsers import parse_links
 
     new_links: List[Link] = []
 
     # parse and validate the import file
-    log_parsing_started(import_path)
-    raw_links, parser_name = parse_links(import_path)
+    raw_links, parser_name = parse_links(source_path)
     new_links = validate_links(raw_links)
+    if parser_name:
+        num_parsed = len(raw_links)
+        log_parsing_finished(num_parsed, parser_name)
+
+    return new_links
+
+
+@enforce_types
+def dedupe_links(existing_links: List[Link],
+                 new_links: List[Link]) -> Tuple[List[Link], List[Link]]:
+
+    from ..parsers import parse_links
 
     # merge existing links in out_dir and new links
     all_links = validate_links(existing_links + new_links)
     all_link_urls = {link.url for link in existing_links}
@@ -290,11 +301,7 @@ def import_new_links(existing_links: List[Link],
         link for link in new_links
         if link.url not in all_link_urls
     ]
+    log_deduping_finished(len(new_links))
 
-    if parser_name:
-        num_parsed = len(raw_links)
-        num_new_links = len(all_links) - len(existing_links)
-        log_parsing_finished(num_parsed, num_new_links, parser_name)
     return all_links, new_links
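import_new_links() is split into two smaller steps, parse_links_from_source() and dedupe_links(), so the crawl loop in add() can re-run just the parsing half for each downloaded page. A hedged sketch of the new flow (the source path and setup are illustrative assumptions):

from archivebox.config import OUTPUT_DIR
from archivebox.index import load_main_index, parse_links_from_source, dedupe_links

all_links = load_main_index(out_dir=OUTPUT_DIR)
# hypothetical source file previously written into output/sources/
new_links = parse_links_from_source('output/sources/1594650000-import.txt')
all_links, new_links = dedupe_links(all_links, new_links)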

View file

@@ -4,8 +4,7 @@ import os
 import sys
 import shutil
 
-from typing import Dict, List, Optional, Iterable, IO
+from typing import Dict, List, Optional, Iterable, IO, Union
 
 from crontab import CronTab, CronSlices
 
 from .cli import (
@@ -17,16 +16,17 @@ from .cli import (
     archive_cmds,
 )
 from .parsers import (
-    save_stdin_to_sources,
-    save_file_to_sources,
+    save_text_as_source,
+    save_file_as_source,
 )
 from .index.schema import Link
-from .util import enforce_types, docstring
+from .util import enforce_types, docstring  # type: ignore
 from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
 from .index import (
     links_after_timestamp,
     load_main_index,
-    import_new_links,
+    parse_links_from_source,
+    dedupe_links,
     write_main_index,
     link_matches_filter,
     get_indexed_folders,
@@ -51,7 +51,7 @@ from .index.sql import (
     apply_migrations,
 )
 from .index.html import parse_html_main_index
-from .extractors import archive_link
+from .extractors import archive_links
 from .config import (
     stderr,
     ConfigDict,
@@ -91,9 +91,8 @@ from .config import (
 from .cli.logging import (
     TERM_WIDTH,
     TimedProgress,
-    log_archiving_started,
-    log_archiving_paused,
-    log_archiving_finished,
+    log_importing_started,
+    log_crawl_started,
     log_removal_started,
     log_removal_finished,
     log_list_started,
@@ -496,59 +495,55 @@ def status(out_dir: str=OUTPUT_DIR) -> None:
 @enforce_types
-def add(url: str,
+def add(urls: Union[str, List[str]],
         depth: int=0,
         update_all: bool=not ONLY_NEW,
         index_only: bool=False,
         out_dir: str=OUTPUT_DIR) -> List[Link]:
     """Add a new URL or list of URLs to your archive"""
 
-    # Load list of links from the existing index
+    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
+
     check_data_folder(out_dir=out_dir)
-    base_path = save_stdin_to_sources(url, out_dir=out_dir)
-    if depth == 1:
-        depth_path = save_file_to_sources(url, out_dir=out_dir)
     check_dependencies()
 
-    # merge in and dedupe new links from import_path
+    # Step 1: Load list of links from the existing index
     all_links: List[Link] = []
     new_links: List[Link] = []
     all_links = load_main_index(out_dir=out_dir)
-    all_links, new_links = import_new_links(all_links, base_path, out_dir=out_dir)
-    if depth == 1:
-        all_links, new_links_depth = import_new_links(all_links, depth_path, out_dir=out_dir)
-        new_links = new_links + new_links_depth
 
-    # Step 2: Write updated index with deduped old and new links back to disk
-    write_main_index(links=all_links, out_dir=out_dir)
+    log_importing_started(urls=urls, depth=depth, index_only=index_only)
+    if isinstance(urls, str):
+        # save verbatim stdin to sources
+        write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir)
+    elif isinstance(urls, list):
+        # save verbatim args to sources
+        write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
+
+    new_links += parse_links_from_source(write_ahead_log)
+    all_links, new_links = dedupe_links(all_links, new_links)
+    write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
+
+    # If we're going one level deeper, download each link and look for more links
+    if new_links and depth == 1:
+        log_crawl_started(new_links)
+        for new_link in new_links:
+            downloaded_file = save_file_as_source(new_link.url, filename='{ts}-crawl-{basename}.txt', out_dir=out_dir)
+            new_links += parse_links_from_source(downloaded_file)
+        all_links, new_links = dedupe_links(all_links, new_links)
+        write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
 
     if index_only:
         return all_links
 
-    # Step 3: Run the archive methods for each link
-    links = all_links if update_all else new_links
-    log_archiving_started(len(links))
-    idx: int = 0
-    link: Link = None  # type: ignore
-    try:
-        for idx, link in enumerate(links):
-            archive_link(link, out_dir=link.link_dir)
-    except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
-        raise SystemExit(0)
-    except:
-        print()
-        raise
-    log_archiving_finished(len(links))
+    # Run the archive methods for each link
+    to_archive = all_links if update_all else new_links
+    archive_links(to_archive, out_dir=out_dir)
 
     # Step 4: Re-write links index with updated titles, icons, and resources
-    all_links = load_main_index(out_dir=out_dir)
-    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
+    if to_archive:
+        all_links = load_main_index(out_dir=out_dir)
+        write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
+
     return all_links
 
 
 @enforce_types
@@ -671,23 +666,8 @@ def update(resume: Optional[float]=None,
         return all_links
 
     # Step 3: Run the archive methods for each link
-    links = new_links if only_new else all_links
-    log_archiving_started(len(links), resume)
-    idx: int = 0
-    link: Link = None  # type: ignore
-    try:
-        for idx, link in enumerate(links_after_timestamp(links, resume)):
-            archive_link(link, overwrite=overwrite, out_dir=link.link_dir)
-    except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
-        raise SystemExit(0)
-    except:
-        print()
-        raise
-    log_archiving_finished(len(links))
+    to_archive = new_links if only_new else all_links
+    archive_links(to_archive, out_dir=out_dir)
 
     # Step 4: Re-write links index with updated titles, icons, and resources
     all_links = load_main_index(out_dir=out_dir)
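After this change add() takes urls (a newline-separated string from stdin, or a list of CLI arguments) instead of a single url, and both add() and update() delegate the archiving loop to archive_links(). A hedged example of calling the new signature directly, assuming an initialized data folder:

from archivebox.main import add

# list form, as passed from CLI arguments
add(urls=['https://example.com', 'https://example.com/some/rss/feed.xml'], depth=0)

# string form, as passed via stdin; index_only skips the archiving step
add(urls='https://example.com\nhttps://example.org', depth=1, index_only=True)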

View file

@@ -29,7 +29,7 @@ from ..util import (
     URL_REGEX,
 )
 from ..index.schema import Link
-from ..cli.logging import pretty_path, TimedProgress
+from ..cli.logging import pretty_path, TimedProgress, log_source_saved
 from .pocket_html import parse_pocket_html_export
 from .pinboard_rss import parse_pinboard_rss_export
 from .shaarli_rss import parse_shaarli_rss_export
@@ -83,36 +83,22 @@ def parse_links(source_file: str) -> Tuple[List[Link], str]:
 
 @enforce_types
-def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str:
-    check_data_folder(out_dir=out_dir)
-
-    sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
-    if not os.path.exists(sources_dir):
-        os.makedirs(sources_dir)
-
+def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: str=OUTPUT_DIR) -> str:
     ts = str(datetime.now().timestamp()).split('.', 1)[0]
-
-    source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts))
+    source_path = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME, filename.format(ts=ts))
     atomic_write(source_path, raw_text)
+    log_source_saved(source_file=source_path)
     return source_path
 
 
 @enforce_types
-def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DIR) -> str:
+def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{basename}.txt', out_dir: str=OUTPUT_DIR) -> str:
     """download a given url's content into output/sources/domain-<timestamp>.txt"""
-    check_data_folder(out_dir=out_dir)
-
-    sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
-    if not os.path.exists(sources_dir):
-        os.makedirs(sources_dir)
-
     ts = str(datetime.now().timestamp()).split('.', 1)[0]
-
-    source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts))
+    source_path = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME, filename.format(basename=basename(path), ts=ts))
 
     if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
         # Source is a URL that needs to be downloaded
-        source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts))
         print('{}[*] [{}] Downloading {}{}'.format(
             ANSI['green'],
             datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
@@ -140,7 +126,7 @@ def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DI
     atomic_write(source_path, raw_source_text)
 
-    print('    > {}'.format(pretty_path(source_path)))
+    log_source_saved(source_file=source_path)
 
     return source_path
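The renamed helpers save_text_as_source() and save_file_as_source() both write a verbatim copy of their input into output/sources/ using a filename template and now report the saved path via log_source_saved(). A hedged usage sketch (it assumes output/sources/ already exists, since the explicit makedirs call was dropped in this diff):

from archivebox.config import OUTPUT_DIR
from archivebox.parsers import save_text_as_source, save_file_as_source

# save raw text (e.g. piped-in URLs) as a source file
src = save_text_as_source('https://example.com\n', filename='{ts}-import.txt', out_dir=OUTPUT_DIR)

# download a URL's content into output/sources/ for depth=1 crawling
crawl_src = save_file_as_source('https://example.com', filename='{ts}-crawl-{basename}.txt', out_dir=OUTPUT_DIR)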