mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-13 14:44:29 -04:00)

commit d3bfa98a91 (parent 354a63ccd4): fix depth flag and tweak logging
7 changed files with 156 additions and 127 deletions
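
In short: archivebox add now accepts any number of URLs as CLI arguments or on stdin (but not both), --depth is validated to 0 or 1, and the import/dedupe/crawl steps log through new helpers. As a rough usage sketch of the reworked Python API (not part of the commit itself; the argument values are illustrative):

    # sketch only: mirrors the new add() signature shown in the diff below
    from archivebox.main import add

    add(
        urls=['https://example.com', 'https://example.com/some/rss/feed.xml'],
        depth=1,           # 0 = archive only the given URLs, 1 = also crawl pages one hop out
        update_all=False,  # archive just the newly added links
        index_only=False,  # True would stop after writing the index
    )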
@@ -106,9 +106,19 @@ def main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided,
     if command.help or command.subcommand is None:
         command.subcommand = 'help'
-    if command.version:
+    elif command.version:
         command.subcommand = 'version'

+    if command.subcommand not in ('help', 'version', 'status'):
+        from ..cli.logging import log_cli_command
+
+        log_cli_command(
+            subcommand=command.subcommand,
+            subcommand_args=command.subcommand_args,
+            stdin=stdin,
+            pwd=pwd or OUTPUT_DIR
+        )
+
     run_subcommand(
         subcommand=command.subcommand,
         subcommand_args=command.subcommand_args,

@@ -10,7 +10,7 @@ from typing import List, Optional, IO

 from ..main import add, docstring
 from ..config import OUTPUT_DIR, ONLY_NEW
-from .logging import SmartFormatter, accept_stdin
+from .logging import SmartFormatter, accept_stdin, stderr


 @docstring(add.__doc__)
@@ -33,12 +33,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         help="Add the links to the main index without archiving them",
     )
     parser.add_argument(
-        'import_path',
-        nargs='?',
+        'urls',
+        nargs='*',
         type=str,
         default=None,
         help=(
-            'URL or path to local file to start the archiving process from. e.g.:\n'
+            'URLs or paths to archive e.g.:\n'
            ' https://getpocket.com/users/USERNAME/feed/all\n'
            ' https://example.com/some/rss/feed.xml\n'
            ' https://example.com\n'
@@ -50,25 +50,21 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         "--depth",
         action="store",
         default=0,
-        choices=[0,1],
+        choices=[0, 1],
         type=int,
         help="Recursively archive all linked pages up to this many hops away"
     )
     command = parser.parse_args(args or ())
-    import_string = accept_stdin(stdin)
-    if import_string and command.import_path:
+    urls = command.urls
+    stdin_urls = accept_stdin(stdin)
+    if (stdin_urls and urls) or (not stdin and not urls):
         stderr(
-            '[X] You should pass an import path or a page url as an argument or in stdin but not both\n',
+            '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',
             color='red',
         )
         raise SystemExit(2)
-    elif import_string:
-        import_path = import_string
-    else:
-        import_path = command.import_path

     add(
-        url=import_path,
+        urls=stdin_urls or urls,
         depth=command.depth,
         update_all=command.update_all,
         index_only=command.index_only,

@@ -5,10 +5,12 @@ import os
 import sys
 import time
 import argparse
+import logging
+import signal
+from multiprocessing import Process

 from datetime import datetime
 from dataclasses import dataclass
-from multiprocessing import Process
 from typing import Optional, List, Dict, Union, IO

 from ..index.schema import Link, ArchiveResult
@@ -23,11 +25,11 @@ from ..config import (
     SHOW_PROGRESS,
     TERM_WIDTH,
     OUTPUT_DIR,
+    SOURCES_DIR_NAME,
     HTML_INDEX_FILENAME,
     stderr,
 )


 @dataclass
 class RuntimeStats:
     """mutable stats counter for logging archiving timing info to CLI output"""
@@ -98,9 +100,9 @@ class TimedProgress:

         if SHOW_PROGRESS:
             # terminate if we havent already terminated
             if self.p is not None:
                 self.p.terminate()
-            self.p = None
+                self.p.join()
+                self.p.close()

             # clear whole terminal line
             try:
@@ -145,28 +147,51 @@ def progress_bar(seconds: int, prefix: str='') -> None:
                 seconds,
             ))
             sys.stdout.flush()
-    except KeyboardInterrupt:
+    except (KeyboardInterrupt, BrokenPipeError):
         print()
         pass


+def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str], pwd: str):
+    from ..config import VERSION, ANSI
+    cmd = ' '.join(('archivebox', subcommand, *subcommand_args))
+    stdin_hint = ' < /dev/stdin' if not stdin.isatty() else ''
+    print('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{stdin_hint}{reset}'.format(
+        now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+        VERSION=VERSION,
+        cmd=cmd,
+        stdin_hint=stdin_hint,
+        **ANSI,
+    ))
+    print('{black} > {pwd}{reset}'.format(pwd=pwd, **ANSI))
+    print()

 ### Parsing Stage

-def log_parsing_started(source_file: str):
-    start_ts = datetime.now()
-    _LAST_RUN_STATS.parse_start_ts = start_ts
-    print('\n{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
-        start_ts.strftime('%Y-%m-%d %H:%M:%S'),
-        source_file.rsplit('/', 1)[-1],

+def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: bool):
+    _LAST_RUN_STATS.parse_start_ts = datetime.now()
+    print('{green}[+] [{}] Adding {} links to index (crawl depth={}){}...{reset}'.format(
+        _LAST_RUN_STATS.parse_start_ts.strftime('%Y-%m-%d %H:%M:%S'),
+        len(urls) if isinstance(urls, list) else len(urls.split('\n')),
+        depth,
+        ' (index only)' if index_only else '',
+        **ANSI,
+    ))
+
+def log_source_saved(source_file: str):
+    print(' > Saved verbatim input to {}/{}'.format(SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1]))

-def log_parsing_finished(num_parsed: int, num_new_links: int, parser_name: str):
-    end_ts = datetime.now()
-    _LAST_RUN_STATS.parse_end_ts = end_ts
-    print(' > Parsed {} links as {} ({} new links added)'.format(num_parsed, parser_name, num_new_links))
+def log_parsing_finished(num_parsed: int, parser_name: str):
+    _LAST_RUN_STATS.parse_end_ts = datetime.now()
+    print(' > Parsed {} URLs from input ({})'.format(num_parsed, parser_name))
+
+def log_deduping_finished(num_new_links: int):
+    print(' > Found {} new URLs not already in index'.format(num_new_links))
+
+
+def log_crawl_started(new_links):
+    print('{lightblue}[*] Starting crawl of {} sites 1 hop out from starting point{reset}'.format(len(new_links), **ANSI))

 ### Indexing Stage

@@ -174,7 +199,7 @@ def log_indexing_process_started(num_links: int):
     start_ts = datetime.now()
     _LAST_RUN_STATS.index_start_ts = start_ts
     print()
-    print('{green}[*] [{}] Writing {} links to main index...{reset}'.format(
+    print('{black}[*] [{}] Writing {} links to main index...{reset}'.format(
         start_ts.strftime('%Y-%m-%d %H:%M:%S'),
         num_links,
         **ANSI,
@@ -209,7 +234,7 @@ def log_archiving_started(num_links: int, resume: Optional[float]=None):
             **ANSI,
         ))
     else:
-        print('{green}[▶] [{}] Updating content for {} matching pages in archive...{reset}'.format(
+        print('{green}[▶] [{}] Collecting content for {} Snapshots in archive...{reset}'.format(
             start_ts.strftime('%Y-%m-%d %H:%M:%S'),
             num_links,
             **ANSI,

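For reference, the new log_cli_command and log_importing_started helpers above print lines of the following shape (timestamp, version, path, and counts are illustrative, not taken from the commit):

    [i] [2020-01-01 12:00:00] ArchiveBox v0.4.x: archivebox add --depth=1 https://example.com
     > /path/to/archive
    [+] [2020-01-01 12:00:00] Adding 2 links to index (crawl depth=1)...
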
@@ -2,7 +2,7 @@ __package__ = 'archivebox.extractors'

 import os

-from typing import Optional
+from typing import Optional, List
 from datetime import datetime

 from ..index.schema import Link
@@ -13,6 +13,9 @@ from ..index import (
 )
 from ..util import enforce_types
 from ..cli.logging import (
+    log_archiving_started,
+    log_archiving_paused,
+    log_archiving_finished,
     log_link_archiving_started,
     log_link_archiving_finished,
     log_archive_method_started,
@@ -103,3 +106,25 @@ def archive_link(link: Link, overwrite: bool=False, out_dir: Optional[str]=None)
             raise

     return link
+
+
+@enforce_types
+def archive_links(links: List[Link], out_dir: Optional[str]=None) -> List[Link]:
+    if not links:
+        return []
+
+    log_archiving_started(len(links))
+    idx: int = 0
+    link: Link = links[0]
+    try:
+        for idx, link in enumerate(links):
+            archive_link(link, out_dir=link.link_dir)
+    except KeyboardInterrupt:
+        log_archiving_paused(len(links), idx, link.timestamp)
+        raise SystemExit(0)
+    except BaseException:
+        print()
+        raise
+
+    log_archiving_finished(len(links))
+    return links

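The archive_links() helper added above centralizes the per-link loop, the Ctrl+C "pause" handling, and the start/finish logging that the later hunks remove from add() and update(). A hedged sketch of calling it directly, assuming an existing archive (the data directory path is hypothetical):

    # sketch only: load_main_index and archive_links are the functions shown in this diff
    from archivebox.index import load_main_index
    from archivebox.extractors import archive_links

    links = load_main_index(out_dir='/path/to/archive')    # hypothetical data dir
    archive_links(list(links), out_dir='/path/to/archive')
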
@@ -33,8 +33,8 @@ from ..cli.logging import (
     log_indexing_process_finished,
     log_indexing_started,
     log_indexing_finished,
-    log_parsing_started,
     log_parsing_finished,
+    log_deduping_finished,
 )

 from .schema import Link, ArchiveResult
@@ -268,20 +268,31 @@ def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]:

     return None


 @enforce_types
-def import_new_links(existing_links: List[Link],
-                     import_path: str,
-                     out_dir: str=OUTPUT_DIR) -> Tuple[List[Link], List[Link]]:
+def parse_links_from_source(source_path: str) -> Tuple[List[Link], List[Link]]:

     from ..parsers import parse_links

     new_links: List[Link] = []

     # parse and validate the import file
-    log_parsing_started(import_path)
-    raw_links, parser_name = parse_links(import_path)
+    raw_links, parser_name = parse_links(source_path)
     new_links = validate_links(raw_links)

+    if parser_name:
+        num_parsed = len(raw_links)
+        log_parsing_finished(num_parsed, parser_name)
+
+    return new_links
+
+
+@enforce_types
+def dedupe_links(existing_links: List[Link],
+                 new_links: List[Link]) -> Tuple[List[Link], List[Link]]:
+
+    from ..parsers import parse_links
+
     # merge existing links in out_dir and new links
     all_links = validate_links(existing_links + new_links)
     all_link_urls = {link.url for link in existing_links}
@@ -290,11 +301,7 @@ def import_new_links(existing_links: List[Link],
         link for link in new_links
         if link.url not in all_link_urls
     ]

-    if parser_name:
-        num_parsed = len(raw_links)
-        num_new_links = len(all_links) - len(existing_links)
-        log_parsing_finished(num_parsed, num_new_links, parser_name)
+    log_deduping_finished(len(new_links))

     return all_links, new_links

@@ -4,8 +4,7 @@ import os
 import sys
 import shutil

-from typing import Dict, List, Optional, Iterable, IO
-
+from typing import Dict, List, Optional, Iterable, IO, Union
 from crontab import CronTab, CronSlices

 from .cli import (
@@ -17,16 +16,17 @@ from .cli import (
     archive_cmds,
 )
 from .parsers import (
-    save_stdin_to_sources,
-    save_file_to_sources,
+    save_text_as_source,
+    save_file_as_source,
 )
 from .index.schema import Link
-from .util import enforce_types, docstring
+from .util import enforce_types, docstring # type: ignore
 from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
 from .index import (
     links_after_timestamp,
     load_main_index,
-    import_new_links,
+    parse_links_from_source,
+    dedupe_links,
     write_main_index,
     link_matches_filter,
     get_indexed_folders,
@@ -51,7 +51,7 @@ from .index.sql import (
     apply_migrations,
 )
 from .index.html import parse_html_main_index
-from .extractors import archive_link
+from .extractors import archive_links
 from .config import (
     stderr,
     ConfigDict,
@@ -91,9 +91,8 @@ from .config import (
 from .cli.logging import (
     TERM_WIDTH,
     TimedProgress,
-    log_archiving_started,
-    log_archiving_paused,
-    log_archiving_finished,
+    log_importing_started,
+    log_crawl_started,
     log_removal_started,
     log_removal_finished,
     log_list_started,
@@ -496,57 +495,53 @@ def status(out_dir: str=OUTPUT_DIR) -> None:


 @enforce_types
-def add(url: str,
+def add(urls: Union[str, List[str]],
         depth: int=0,
         update_all: bool=not ONLY_NEW,
         index_only: bool=False,
         out_dir: str=OUTPUT_DIR) -> List[Link]:
     """Add a new URL or list of URLs to your archive"""

+    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
+
+    # Load list of links from the existing index
     check_data_folder(out_dir=out_dir)
-
-    base_path = save_stdin_to_sources(url, out_dir=out_dir)
-    if depth == 1:
-        depth_path = save_file_to_sources(url, out_dir=out_dir)
     check_dependencies()
-
-    # Step 1: Load list of links from the existing index
-    #         merge in and dedupe new links from import_path
     all_links: List[Link] = []
     new_links: List[Link] = []
     all_links = load_main_index(out_dir=out_dir)
-    all_links, new_links = import_new_links(all_links, base_path, out_dir=out_dir)
-    if depth == 1:
-        all_links, new_links_depth = import_new_links(all_links, depth_path, out_dir=out_dir)
-        new_links = new_links + new_links_depth

+    log_importing_started(urls=urls, depth=depth, index_only=index_only)
+    if isinstance(urls, str):
+        # save verbatim stdin to sources
+        write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir)
+    elif isinstance(urls, list):
+        # save verbatim args to sources
+        write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
+
+    new_links += parse_links_from_source(write_ahead_log)
+    all_links, new_links = dedupe_links(all_links, new_links)
+    write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)

-    # Step 2: Write updated index with deduped old and new links back to disk
-    write_main_index(links=all_links, out_dir=out_dir)
+    # If we're going one level deeper, download each link and look for more links
+    if new_links and depth == 1:
+        log_crawl_started(new_links)
+        for new_link in new_links:
+            downloaded_file = save_file_as_source(new_link.url, filename='{ts}-crawl-{basename}.txt', out_dir=out_dir)
+            new_links += parse_links_from_source(downloaded_file)
+        all_links, new_links = dedupe_links(all_links, new_links)
+        write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)

     if index_only:
         return all_links

-    # Step 3: Run the archive methods for each link
-    links = all_links if update_all else new_links
-    log_archiving_started(len(links))
-    idx: int = 0
-    link: Link = None # type: ignore
-    try:
-        for idx, link in enumerate(links):
-            archive_link(link, out_dir=link.link_dir)
-
-    except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
-        raise SystemExit(0)
-
-    except:
-        print()
-        raise
-
-    log_archiving_finished(len(links))
+    # Run the archive methods for each link
+    to_archive = all_links if update_all else new_links
+    archive_links(to_archive, out_dir=out_dir)

     # Step 4: Re-write links index with updated titles, icons, and resources
+    if to_archive:
         all_links = load_main_index(out_dir=out_dir)
         write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
     return all_links
@@ -671,23 +666,8 @@ def update(resume: Optional[float]=None,
         return all_links

     # Step 3: Run the archive methods for each link
-    links = new_links if only_new else all_links
-    log_archiving_started(len(links), resume)
-    idx: int = 0
-    link: Link = None # type: ignore
-    try:
-        for idx, link in enumerate(links_after_timestamp(links, resume)):
-            archive_link(link, overwrite=overwrite, out_dir=link.link_dir)
-
-    except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
-        raise SystemExit(0)
-
-    except:
-        print()
-        raise
-
-    log_archiving_finished(len(links))
+    to_archive = new_links if only_new else all_links
+    archive_links(to_archive, out_dir=out_dir)

     # Step 4: Re-write links index with updated titles, icons, and resources
     all_links = load_main_index(out_dir=out_dir)

@@ -29,7 +29,7 @@ from ..util import (
     URL_REGEX,
 )
 from ..index.schema import Link
-from ..cli.logging import pretty_path, TimedProgress
+from ..cli.logging import pretty_path, TimedProgress, log_source_saved
 from .pocket_html import parse_pocket_html_export
 from .pinboard_rss import parse_pinboard_rss_export
 from .shaarli_rss import parse_shaarli_rss_export
@@ -83,36 +83,22 @@ def parse_links(source_file: str) -> Tuple[List[Link], str]:


 @enforce_types
-def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str:
-    check_data_folder(out_dir=out_dir)
-
-    sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
-    if not os.path.exists(sources_dir):
-        os.makedirs(sources_dir)
-
+def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: str=OUTPUT_DIR) -> str:
     ts = str(datetime.now().timestamp()).split('.', 1)[0]
-
-    source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts))
+    source_path = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME, filename.format(ts=ts))
     atomic_write(source_path, raw_text)
+    log_source_saved(source_file=source_path)
     return source_path


 @enforce_types
-def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DIR) -> str:
+def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{basename}.txt', out_dir: str=OUTPUT_DIR) -> str:
     """download a given url's content into output/sources/domain-<timestamp>.txt"""
     check_data_folder(out_dir=out_dir)

-    sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
-    if not os.path.exists(sources_dir):
-        os.makedirs(sources_dir)
-
     ts = str(datetime.now().timestamp()).split('.', 1)[0]
-
-    source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts))
+    source_path = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME, filename.format(basename=basename(path), ts=ts))

     if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
         # Source is a URL that needs to be downloaded
-        source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts))
         print('{}[*] [{}] Downloading {}{}'.format(
             ANSI['green'],
             datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
@@ -140,7 +126,7 @@ def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DI

     atomic_write(source_path, raw_source_text)

-    print(' > {}'.format(pretty_path(source_path)))
+    log_source_saved(source_file=source_path)

     return source_path