major codebase-wide code cleanups
This commit is contained in:
parent c806068683
commit e6bd1f8ca8

8 changed files with 825 additions and 743 deletions

archivebox

@@ -1,225 +1,132 @@
#!/usr/bin/env python3
# ArchiveBox
# Nick Sweeting 2017 | MIT License
# https://github.com/pirate/ArchiveBox

"""
ArchiveBox command line application.

./archive and ./bin/archivebox both point to this file,
but you can also run it directly using `python3 archive.py`

Usage & Documentation:
    https://github.com/pirate/ArchiveBox/Wiki
"""

import os
import sys

from datetime import datetime
from peekable import Peekable

from parse import parse_links
from links import validate_links, links_after_timestamp
from archive_methods import archive_link, _RESULTS_TOTALS
from index import (
    write_links_index,
    parse_json_links_index,
)
from links import links_after_timestamp
from index import write_links_index, load_links_index
from archive_methods import archive_link
from config import (
    ARCHIVE_DIR,
    ONLY_NEW,
    OUTPUT_DIR,
    REPO_DIR,
    ANSI,
    GIT_SHA,
)
from util import (
    check_dependencies,
    save_remote_source,
    save_stdin_source,
    pretty_path,
    check_links_structure,
)
from logs import (
    log_archiving_started,
    log_archiving_paused,
    log_archiving_finished,
)

__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
__VERSION__ = GIT_SHA
__DESCRIPTION__ = 'ArchiveBox Usage: Create a browsable html archive of a list of links.'
__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'


def print_help():
    print(__DESCRIPTION__)
    print("Documentation: {}\n".format(__DOCUMENTATION__))
    print('ArchiveBox: The self-hosted internet archive.\n')
    print("Documentation:")
    print("    https://github.com/pirate/ArchiveBox/wiki\n")
    print("Usage:")
    print("    ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
    print("")
    print("    ./bin/archivebox https://example.com/feed.rss\n")
    print("")
    print("    echo 'https://example.com' | ./bin/archivebox\n")
    print("    ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
    print("    ./bin/archivebox https://example.com/feed.rss\n")
    print("    ./bin/archivebox 15109948213.123\n")


def load_links(archive_path=OUTPUT_DIR, import_path=None):
    """get new links from file and optionally append them to links in existing archive"""

    existing_links = []
    if archive_path:
        existing_links = parse_json_links_index(archive_path)
        check_links_structure(existing_links)

    new_links = []
    if import_path:
        # parse and validate the import file
        raw_links, parser_name = parse_links(import_path)
        new_links = validate_links(raw_links)
        check_links_structure(new_links)

    # merge existing links in archive_path and new links
    all_links = validate_links(existing_links + new_links)
    check_links_structure(all_links)
    num_new_links = len(all_links) - len(existing_links)

    if import_path and parser_name:
        print(' > Adding {} new links to index (parsed import as {})'.format(
            num_new_links,
            parser_name,
        ))

    return all_links, new_links

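# Illustrative usage sketch (not part of this commit): load_links merges whatever
# is already in the JSON index with a freshly parsed import file, returning both
# the full deduped list and only the newly added links, roughly:
#
#   all_links, new_links = load_links(archive_path=OUTPUT_DIR,
#                                     import_path='output/sources/bookmarks.html')
#
# The import_path shown above is hypothetical; any exported bookmarks file works.
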
def update_archive(archive_path, links, source=None, resume=None, append=True):
    """update or create index.html+json given a path to an export file containing new links"""

    start_ts = datetime.now().timestamp()

    if resume:
        print('{green}[▶] [{}] Resuming archive downloading from {}...{reset}'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            resume,
            **ANSI,
        ))
    else:
        print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            len(links),
            **ANSI,
        ))

    check_links_structure(links)

    # prefetch the first link off the generator so that if we pause or fail
    # immediately we can show that we paused on the first link and not just None
    to_archive = Peekable(links_after_timestamp(links, resume))
    idx, link = 0, to_archive.peek(0)

    # loop over links and archive them
    try:
        check_dependencies()
        for idx, link in enumerate(to_archive):
            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
            archive_link(link_dir, link)

    except (KeyboardInterrupt, SystemExit, Exception) as e:
        # if isinstance(e, KeyboardInterrupt):
        #     # Step 4: Re-write links index with updated titles, icons, and resources
        #     all_links, _ = load_links(archive_path=out_dir)
        #     write_links_index(out_dir=out_dir, links=all_links, finished=True)
        print()
        print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
            **ANSI,
            now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            idx=idx+1,
            timestamp=link['timestamp'],
            total=len(links),
        ))
        print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
        print(' Continue where you left off by running:')
        print(' {} {}'.format(
            pretty_path(sys.argv[0]),
            link['timestamp'],
        ))
        if not isinstance(e, KeyboardInterrupt):
            print()
            raise e
        raise SystemExit(1)

    # print timing information & summary
    end_ts = datetime.now().timestamp()
    seconds = end_ts - start_ts
    if seconds > 60:
        duration = '{0:.2f} min'.format(seconds / 60, 2)
    else:
        duration = '{0:.2f} sec'.format(seconds, 2)

    print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
        ANSI['green'],
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        len(links),
        duration,
        ANSI['reset'],
    ))
    print(' - {} entries skipped'.format(_RESULTS_TOTALS['skipped']))
    print(' - {} entries updated'.format(_RESULTS_TOTALS['succeded']))
    print(' - {} errors'.format(_RESULTS_TOTALS['failed']))
    print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))

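# The local `peekable` module is not shown in this diff; a minimal sketch of the
# behavior `update_archive` above relies on might look like the class below
# (assumption -- only .peek(0) and normal iteration are actually used):
class _PeekableSketch:
    """Wrap an iterator so upcoming items can be inspected without consuming them."""
    def __init__(self, iterable):
        self.iterator = iter(iterable)
        self.lookahead = []

    def peek(self, n=0):
        # buffer items up to index n, then return the item at that offset
        while len(self.lookahead) <= n:
            self.lookahead.append(next(self.iterator))
        return self.lookahead[n]

    def __iter__(self):
        return self

    def __next__(self):
        # serve buffered items first so peeking never skips anything
        if self.lookahead:
            return self.lookahead.pop(0)
        return next(self.iterator)
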
if __name__ == '__main__':
    argc = len(sys.argv)

    if set(sys.argv).intersection(('-h', '--help', 'help')):
def main(*args):
    if set(args).intersection(('-h', '--help', 'help')) or len(args) > 2:
        print_help()
        raise SystemExit(0)

    source = sys.argv[1] if argc > 1 else None  # path of links file to import
    resume = sys.argv[2] if argc > 2 else None  # timestamp to resume downloading from

    stdin_raw_text = ''
    ### Handle CLI arguments
    #     ./archive bookmarks.html
    #     ./archive 1523422111.234
    import_path, resume = None, None
    if len(args) == 2:
        # if the argument is a string, it's an import_path file to import
        # if it's a number, it's a timestamp to resume archiving from
        if args[1].replace('.', '').isdigit():
            import_path, resume = None, args[1]
        else:
            import_path, resume = args[1], None

    ### Set up output folder
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    ### Handle ingesting urls piped in through stdin
    # (e.g. if user does cat example_urls.txt | ./archive)
    if not sys.stdin.isatty():
        stdin_raw_text = sys.stdin.read()
        if stdin_raw_text and import_path:
            print(
                '[X] You should pass either a path as an argument, '
                'or pass a list of links via stdin, but not both.\n'
            )
            print_help()
            raise SystemExit(1)

    if source and stdin_raw_text:
        print(
            '[X] You should pass either a path as an argument, '
            'or pass a list of links via stdin, but not both.\n'
        )
        print_help()
        raise SystemExit(1)
        import_path = save_stdin_source(stdin_raw_text)

    ### Handle ingesting urls from a remote file/feed
    # (e.g. if an RSS feed URL is used as the import path)
    if import_path and any(import_path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        import_path = save_remote_source(import_path)

    ### Run the main archive update process
    update_archive_data(import_path=import_path, resume=resume)


    if argc == 1:
        source, resume = None, None
    elif argc == 2:
        if all(d.isdigit() for d in sys.argv[1].split('.')):
            # argv[1] is a resume timestamp
            source, resume = None, sys.argv[1]
        else:
            # argv[1] is a path to a file to import
            source, resume = sys.argv[1].strip(), None
    elif argc == 3:
        source, resume = sys.argv[1].strip(), sys.argv[2]
    else:
        print_help()
        raise SystemExit(1)

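# Both the old and new argument handling above use the same heuristic to decide
# whether a lone argument is a resume timestamp or a file/URL to import. A small
# sketch of that check (is_timestamp is not a real helper in this file):
def is_timestamp(arg):
    # '1523422111.234'        -> True  (digits and dots only: resume point)
    # 'bookmarks_export.html' -> False (anything else: treat as import path)
    return arg.replace('.', '').isdigit()
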
def update_archive_data(import_path=None, resume=None):
    """The main ArchiveBox entrypoint. Everything starts here."""
    check_dependencies()

    # See if archive folder already exists
    for out_dir in (OUTPUT_DIR, 'bookmarks', 'pocket', 'pinboard', 'html'):
        if os.path.exists(out_dir):
            break
    else:
        out_dir = OUTPUT_DIR
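    # Note on the for/else above: Python runs a loop's else branch only when the
    # loop finishes without hitting `break`, so out_dir falls back to OUTPUT_DIR
    # only if none of the candidate folders exist. A tiny illustration:
    #
    #   for name in ('bookmarks', 'pocket'):
    #       if os.path.exists(name):
    #           break
    #   else:
    #       name = 'output'   # reached only when no folder above was found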
    # Step 1: Load list of links from the existing index
    # merge in and dedupe new links from import_path
    all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)

    # Step 0: Download url to local file (only happens if a URL is specified instead of local path)
    if source and any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        source = save_remote_source(source)
    elif stdin_raw_text:
        source = save_stdin_source(stdin_raw_text)

    # Step 1: Parse the links and dedupe them with existing archive
    all_links, new_links = load_links(archive_path=out_dir, import_path=source)

    # Step 2: Write new index
    write_links_index(out_dir=out_dir, links=all_links)
    # Step 2: Write updated index with deduped old and new links back to disk
    write_links_index(out_dir=OUTPUT_DIR, links=all_links)

    # Step 3: Run the archive methods for each link
    if ONLY_NEW:
        update_archive(out_dir, new_links, source=source, resume=resume, append=True)
    else:
        update_archive(out_dir, all_links, source=source, resume=resume, append=True)
    links = new_links if ONLY_NEW else all_links
    log_archiving_started(len(links), resume)
    idx, link = 0, 0
    try:
        for idx, link in enumerate(links_after_timestamp(links, resume)):
            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
            archive_link(link_dir, link)

    except KeyboardInterrupt:
        log_archiving_paused(len(links), idx, link and link['timestamp'])
        raise SystemExit(0)

    except:
        print()
        raise

    log_archiving_finished(len(links))

    # Step 4: Re-write links index with updated titles, icons, and resources
    all_links, _ = load_links(archive_path=out_dir)
    write_links_index(out_dir=out_dir, links=all_links, finished=True)
    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
    write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True)

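# links_after_timestamp is imported from links.py and is not shown in this diff;
# based on how it is called above, a rough sketch of the assumed behavior (skip
# links until the resume timestamp is reached, or yield everything when there is
# no resume point) might look like:
def _links_after_timestamp_sketch(links, timestamp=None):
    if not timestamp:
        yield from links
        return
    for link in links:
        try:
            if float(link['timestamp']) >= float(timestamp):
                yield link
        except (ValueError, TypeError):
            # skip links whose timestamp can't be compared numerically
            continue
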
if __name__ == '__main__':
    main(*sys.argv)