Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-13 06:34:25 -04:00)
major codebase-wide code cleanups

parent: c806068683
commit: e6bd1f8ca8
8 changed files with 825 additions and 743 deletions
|
@ -1,225 +1,132 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# ArchiveBox
|
"""
|
||||||
# Nick Sweeting 2017 | MIT License
|
ArchiveBox command line application.
|
||||||
# https://github.com/pirate/ArchiveBox
|
|
||||||
|
./archive and ./bin/archivebox both point to this file,
|
||||||
|
but you can also run it directly using `python3 archive.py`
|
||||||
|
|
||||||
|
Usage & Documentation:
|
||||||
|
https://github.com/pirate/ArchiveBox/Wiki
|
||||||
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from datetime import datetime
|
from links import links_after_timestamp
|
||||||
from peekable import Peekable
|
from index import write_links_index, load_links_index
|
||||||
|
from archive_methods import archive_link
|
||||||
|
|
||||||
from parse import parse_links
|
|
||||||
from links import validate_links, links_after_timestamp
|
|
||||||
from archive_methods import archive_link, _RESULTS_TOTALS
|
|
||||||
from index import (
|
|
||||||
write_links_index,
|
|
||||||
parse_json_links_index,
|
|
||||||
)
|
|
||||||
from config import (
|
from config import (
|
||||||
ARCHIVE_DIR,
|
ARCHIVE_DIR,
|
||||||
ONLY_NEW,
|
ONLY_NEW,
|
||||||
OUTPUT_DIR,
|
OUTPUT_DIR,
|
||||||
REPO_DIR,
|
|
||||||
ANSI,
|
|
||||||
GIT_SHA,
|
GIT_SHA,
|
||||||
)
|
)
|
||||||
from util import (
|
from util import (
|
||||||
check_dependencies,
|
check_dependencies,
|
||||||
save_remote_source,
|
save_remote_source,
|
||||||
save_stdin_source,
|
save_stdin_source,
|
||||||
pretty_path,
|
)
|
||||||
check_links_structure,
|
from logs import (
|
||||||
|
log_archiving_started,
|
||||||
|
log_archiving_paused,
|
||||||
|
log_archiving_finished,
|
||||||
)
|
)
|
||||||
|
|
||||||
__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
|
__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
|
||||||
__VERSION__ = GIT_SHA
|
__VERSION__ = GIT_SHA
|
||||||
__DESCRIPTION__ = 'ArchiveBox Usage: Create a browsable html archive of a list of links.'
|
__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
|
||||||
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
|
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
|
||||||
|
|
||||||
|
|
||||||
def print_help():
|
def print_help():
|
||||||
print(__DESCRIPTION__)
|
print('ArchiveBox: The self-hosted internet archive.\n')
|
||||||
print("Documentation: {}\n".format(__DOCUMENTATION__))
|
print("Documentation:")
|
||||||
|
print(" https://github.com/pirate/ArchiveBox/wiki\n")
|
||||||
print("Usage:")
|
print("Usage:")
|
||||||
print(" ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
|
|
||||||
print("")
|
|
||||||
print(" ./bin/archivebox https://example.com/feed.rss\n")
|
|
||||||
print("")
|
|
||||||
print(" echo 'https://examplecom' | ./bin/archivebox\n")
|
print(" echo 'https://examplecom' | ./bin/archivebox\n")
|
||||||
|
print(" ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
|
||||||
|
print(" ./bin/archivebox https://example.com/feed.rss\n")
|
||||||
|
print(" ./bin/archivebox 15109948213.123\n")
|
||||||
|
|
||||||
|
|
||||||
def load_links(archive_path=OUTPUT_DIR, import_path=None):
|
def main(*args):
|
||||||
"""get new links from file and optionally append them to links in existing archive"""
|
if set(args).intersection(('-h', '--help', 'help')) or len(args) > 2:
|
||||||
|
|
||||||
existing_links = []
|
|
||||||
if archive_path:
|
|
||||||
existing_links = parse_json_links_index(archive_path)
|
|
||||||
check_links_structure(existing_links)
|
|
||||||
|
|
||||||
new_links = []
|
|
||||||
if import_path:
|
|
||||||
# parse and validate the import file
|
|
||||||
raw_links, parser_name = parse_links(import_path)
|
|
||||||
new_links = validate_links(raw_links)
|
|
||||||
check_links_structure(new_links)
|
|
||||||
|
|
||||||
# merge existing links in archive_path and new links
|
|
||||||
all_links = validate_links(existing_links + new_links)
|
|
||||||
check_links_structure(all_links)
|
|
||||||
num_new_links = len(all_links) - len(existing_links)
|
|
||||||
|
|
||||||
if import_path and parser_name:
|
|
||||||
print(' > Adding {} new links to index (parsed import as {})'.format(
|
|
||||||
num_new_links,
|
|
||||||
parser_name,
|
|
||||||
))
|
|
||||||
|
|
||||||
return all_links, new_links
|
|
||||||
|
|
||||||
|
|
||||||
def update_archive(archive_path, links, source=None, resume=None, append=True):
|
|
||||||
"""update or create index.html+json given a path to an export file containing new links"""
|
|
||||||
|
|
||||||
start_ts = datetime.now().timestamp()
|
|
||||||
|
|
||||||
if resume:
|
|
||||||
print('{green}[▶] [{}] Resuming archive downloading from {}...{reset}'.format(
|
|
||||||
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
|
||||||
resume,
|
|
||||||
**ANSI,
|
|
||||||
))
|
|
||||||
else:
|
|
||||||
print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
|
|
||||||
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
|
||||||
len(links),
|
|
||||||
**ANSI,
|
|
||||||
))
|
|
||||||
|
|
||||||
check_links_structure(links)
|
|
||||||
|
|
||||||
# prefetch the first link off the generator so that if we pause or fail
|
|
||||||
# immediately we can show that we paused on the first link and not just None
|
|
||||||
to_archive = Peekable(links_after_timestamp(links, resume))
|
|
||||||
idx, link = 0, to_archive.peek(0)
|
|
||||||
|
|
||||||
# loop over links and archive them
|
|
||||||
try:
|
|
||||||
check_dependencies()
|
|
||||||
for idx, link in enumerate(to_archive):
|
|
||||||
link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
|
|
||||||
archive_link(link_dir, link)
|
|
||||||
|
|
||||||
except (KeyboardInterrupt, SystemExit, Exception) as e:
|
|
||||||
# if isinstance(e, KeyboardInterrupt):
|
|
||||||
# # Step 4: Re-write links index with updated titles, icons, and resources
|
|
||||||
# all_links, _ = load_links(archive_path=out_dir)
|
|
||||||
# write_links_index(out_dir=out_dir, links=all_links, finished=True)
|
|
||||||
print()
|
|
||||||
print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
|
|
||||||
**ANSI,
|
|
||||||
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
|
||||||
idx=idx+1,
|
|
||||||
timestamp=link['timestamp'],
|
|
||||||
total=len(links),
|
|
||||||
))
|
|
||||||
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
|
|
||||||
print(' Continue where you left off by running:')
|
|
||||||
print(' {} {}'.format(
|
|
||||||
pretty_path(sys.argv[0]),
|
|
||||||
link['timestamp'],
|
|
||||||
))
|
|
||||||
if not isinstance(e, KeyboardInterrupt):
|
|
||||||
print()
|
|
||||||
raise e
|
|
||||||
raise SystemExit(1)
|
|
||||||
|
|
||||||
# print timing information & summary
|
|
||||||
end_ts = datetime.now().timestamp()
|
|
||||||
seconds = end_ts - start_ts
|
|
||||||
if seconds > 60:
|
|
||||||
duration = '{0:.2f} min'.format(seconds / 60, 2)
|
|
||||||
else:
|
|
||||||
duration = '{0:.2f} sec'.format(seconds, 2)
|
|
||||||
|
|
||||||
print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
|
|
||||||
ANSI['green'],
|
|
||||||
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
|
||||||
len(links),
|
|
||||||
duration,
|
|
||||||
ANSI['reset'],
|
|
||||||
))
|
|
||||||
print(' - {} entries skipped'.format(_RESULTS_TOTALS['skipped']))
|
|
||||||
print(' - {} entries updated'.format(_RESULTS_TOTALS['succeded']))
|
|
||||||
print(' - {} errors'.format(_RESULTS_TOTALS['failed']))
|
|
||||||
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
argc = len(sys.argv)
|
|
||||||
|
|
||||||
if set(sys.argv).intersection(('-h', '--help', 'help')):
|
|
||||||
print_help()
|
print_help()
|
||||||
raise SystemExit(0)
|
raise SystemExit(0)
|
||||||
|
|
||||||
source = sys.argv[1] if argc > 1 else None # path of links file to import
|
### Handle CLI arguments
|
||||||
resume = sys.argv[2] if argc > 2 else None # timestamp to resume downloading from
|
# ./archive bookmarks.html
|
||||||
|
# ./archive 1523422111.234
|
||||||
stdin_raw_text = ''
|
import_path, resume = None, None
|
||||||
|
if len(args) == 2:
|
||||||
|
# if the argument is a string, it's an import_path file to import
|
||||||
|
# if it's a number, it's a timestamp to resume archiving from
|
||||||
|
if args[1].replace('.', '').isdigit():
|
||||||
|
import_path, resume = None, args[1]
|
||||||
|
else:
|
||||||
|
import_path, resume = args[1], None
|
||||||
|
|
||||||
|
### Set up output folder
|
||||||
|
if not os.path.exists(OUTPUT_DIR):
|
||||||
|
os.makedirs(OUTPUT_DIR)
|
||||||
|
|
||||||
|
### Handle ingesting urls piped in through stdin
|
||||||
|
# (e.g. if user does cat example_urls.txt | ./archive)
|
||||||
if not sys.stdin.isatty():
|
if not sys.stdin.isatty():
|
||||||
stdin_raw_text = sys.stdin.read()
|
stdin_raw_text = sys.stdin.read()
|
||||||
|
if stdin_raw_text and import_path:
|
||||||
|
print(
|
||||||
|
'[X] You should pass either a path as an argument, '
|
||||||
|
'or pass a list of links via stdin, but not both.\n'
|
||||||
|
)
|
||||||
|
print_help()
|
||||||
|
raise SystemExit(1)
|
||||||
|
|
||||||
if source and stdin_raw_text:
|
import_path = save_stdin_source(stdin_raw_text)
|
||||||
print(
|
|
||||||
'[X] You should pass either a path as an argument, '
|
### Handle ingesting urls from a remote file/feed
|
||||||
'or pass a list of links via stdin, but not both.\n'
|
# (e.g. if an RSS feed URL is used as the import path)
|
||||||
)
|
if import_path and any(import_path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
|
||||||
print_help()
|
import_path = save_remote_source(import_path)
|
||||||
raise SystemExit(1)
|
|
||||||
|
### Run the main archive update process
|
||||||
|
update_archive_data(import_path=import_path, resume=resume)
|
||||||
|
|
||||||
|
|
||||||
if argc == 1:
|
def update_archive_data(import_path=None, resume=None):
|
||||||
source, resume = None, None
|
"""The main ArchiveBox entrancepoint. Everything starts here."""
|
||||||
elif argc == 2:
|
check_dependencies()
|
||||||
if all(d.isdigit() for d in sys.argv[1].split('.')):
|
|
||||||
# argv[1] is a resume timestamp
|
|
||||||
source, resume = None, sys.argv[1]
|
|
||||||
else:
|
|
||||||
# argv[1] is a path to a file to import
|
|
||||||
source, resume = sys.argv[1].strip(), None
|
|
||||||
elif argc == 3:
|
|
||||||
source, resume = sys.argv[1].strip(), sys.argv[2]
|
|
||||||
else:
|
|
||||||
print_help()
|
|
||||||
raise SystemExit(1)
|
|
||||||
|
|
||||||
# See if archive folder already exists
|
# Step 1: Load list of links from the existing index
|
||||||
for out_dir in (OUTPUT_DIR, 'bookmarks', 'pocket', 'pinboard', 'html'):
|
# merge in and dedupe new links from import_path
|
||||||
if os.path.exists(out_dir):
|
all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)
|
||||||
break
|
|
||||||
else:
|
|
||||||
out_dir = OUTPUT_DIR
|
|
||||||
|
|
||||||
# Step 0: Download url to local file (only happens if a URL is specified instead of local path)
|
# Step 2: Write updated index with deduped old and new links back to disk
|
||||||
if source and any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
|
write_links_index(out_dir=OUTPUT_DIR, links=all_links)
|
||||||
source = save_remote_source(source)
|
|
||||||
elif stdin_raw_text:
|
|
||||||
source = save_stdin_source(stdin_raw_text)
|
|
||||||
|
|
||||||
# Step 1: Parse the links and dedupe them with existing archive
|
|
||||||
all_links, new_links = load_links(archive_path=out_dir, import_path=source)
|
|
||||||
|
|
||||||
# Step 2: Write new index
|
|
||||||
write_links_index(out_dir=out_dir, links=all_links)
|
|
||||||
|
|
||||||
# Step 3: Run the archive methods for each link
|
# Step 3: Run the archive methods for each link
|
||||||
if ONLY_NEW:
|
links = new_links if ONLY_NEW else all_links
|
||||||
update_archive(out_dir, new_links, source=source, resume=resume, append=True)
|
log_archiving_started(len(links), resume)
|
||||||
else:
|
idx, link = 0, 0
|
||||||
update_archive(out_dir, all_links, source=source, resume=resume, append=True)
|
try:
|
||||||
|
for idx, link in enumerate(links_after_timestamp(links, resume)):
|
||||||
|
link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
|
||||||
|
archive_link(link_dir, link)
|
||||||
|
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
log_archiving_paused(len(links), idx, link and link['timestamp'])
|
||||||
|
raise SystemExit(0)
|
||||||
|
|
||||||
|
except:
|
||||||
|
print()
|
||||||
|
raise
|
||||||
|
|
||||||
|
log_archiving_finished(len(links))
|
||||||
|
|
||||||
# Step 4: Re-write links index with updated titles, icons, and resources
|
# Step 4: Re-write links index with updated titles, icons, and resources
|
||||||
all_links, _ = load_links(archive_path=out_dir)
|
all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
|
||||||
write_links_index(out_dir=out_dir, links=all_links, finished=True)
|
write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main(*sys.argv)
|
||||||
|
|
|
@ -3,18 +3,18 @@ import os
|
||||||
from functools import wraps
|
from functools import wraps
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from stdlib_patches import run, PIPE, DEVNULL
|
||||||
|
|
||||||
from index import (
|
from index import (
|
||||||
parse_json_link_index,
|
|
||||||
write_link_index,
|
write_link_index,
|
||||||
update_main_index,
|
patch_links_index,
|
||||||
|
load_json_link_index,
|
||||||
)
|
)
|
||||||
from config import (
|
from config import (
|
||||||
CURL_BINARY,
|
CURL_BINARY,
|
||||||
GIT_BINARY,
|
GIT_BINARY,
|
||||||
WGET_BINARY,
|
WGET_BINARY,
|
||||||
YOUTUBEDL_BINARY,
|
YOUTUBEDL_BINARY,
|
||||||
CHROME_BINARY,
|
|
||||||
FETCH_FAVICON,
|
FETCH_FAVICON,
|
||||||
FETCH_TITLE,
|
FETCH_TITLE,
|
||||||
FETCH_WGET,
|
FETCH_WGET,
|
||||||
|
@ -25,62 +25,37 @@ from config import (
|
||||||
FETCH_WARC,
|
FETCH_WARC,
|
||||||
FETCH_GIT,
|
FETCH_GIT,
|
||||||
FETCH_MEDIA,
|
FETCH_MEDIA,
|
||||||
RESOLUTION,
|
|
||||||
CHECK_SSL_VALIDITY,
|
|
||||||
SUBMIT_ARCHIVE_DOT_ORG,
|
SUBMIT_ARCHIVE_DOT_ORG,
|
||||||
COOKIES_FILE,
|
|
||||||
WGET_USER_AGENT,
|
|
||||||
CHROME_USER_AGENT,
|
|
||||||
CHROME_USER_DATA_DIR,
|
|
||||||
CHROME_HEADLESS,
|
|
||||||
CHROME_SANDBOX,
|
|
||||||
TIMEOUT,
|
TIMEOUT,
|
||||||
MEDIA_TIMEOUT,
|
MEDIA_TIMEOUT,
|
||||||
ANSI,
|
ANSI,
|
||||||
ARCHIVE_DIR,
|
OUTPUT_DIR,
|
||||||
GIT_DOMAINS,
|
GIT_DOMAINS,
|
||||||
GIT_SHA,
|
GIT_SHA,
|
||||||
|
WGET_USER_AGENT,
|
||||||
|
CHECK_SSL_VALIDITY,
|
||||||
|
COOKIES_FILE,
|
||||||
)
|
)
|
||||||
from util import (
|
from util import (
|
||||||
domain,
|
domain,
|
||||||
|
extension,
|
||||||
without_query,
|
without_query,
|
||||||
without_fragment,
|
without_fragment,
|
||||||
fetch_page_title,
|
fetch_page_title,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
progress,
|
progress,
|
||||||
chmod_file,
|
chmod_file,
|
||||||
pretty_path,
|
|
||||||
print_error_hints,
|
|
||||||
check_link_structure,
|
check_link_structure,
|
||||||
wget_output_path,
|
wget_output_path,
|
||||||
run, PIPE, DEVNULL,
|
chrome_args,
|
||||||
|
)
|
||||||
|
from logs import (
|
||||||
|
_LAST_RUN_STATS,
|
||||||
|
log_link_archiving_started,
|
||||||
|
log_link_archiving_failed,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
_RESULTS_TOTALS = { # globals are bad, mmkay
|
|
||||||
'skipped': 0,
|
|
||||||
'succeded': 0,
|
|
||||||
'failed': 0,
|
|
||||||
}
|
|
||||||
|
|
||||||
def load_link_index(link_dir, link):
|
|
||||||
"""check for an existing link archive in the given directory,
|
|
||||||
and load+merge it into the given link dict
|
|
||||||
"""
|
|
||||||
is_new = not os.path.exists(link_dir)
|
|
||||||
if is_new:
|
|
||||||
os.makedirs(link_dir)
|
|
||||||
else:
|
|
||||||
link = {
|
|
||||||
**parse_json_link_index(link_dir),
|
|
||||||
**link,
|
|
||||||
}
|
|
||||||
|
|
||||||
check_link_structure(link)
|
|
||||||
print_link_status_line(link_dir, link, is_new)
|
|
||||||
|
|
||||||
return link
|
|
||||||
|
|
||||||
|
|
||||||
class ArchiveError(Exception):
|
class ArchiveError(Exception):
|
||||||
def __init__(self, message, hints=None):
|
def __init__(self, message, hints=None):
|
||||||
|
@ -105,32 +80,24 @@ def archive_link(link_dir, link, overwrite=True):
|
||||||
active_methods = [method for toggle, method in ARCHIVE_METHODS if toggle]
|
active_methods = [method for toggle, method in ARCHIVE_METHODS if toggle]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
link = load_link_index(link_dir, link)
|
is_new = not os.path.exists(link_dir)
|
||||||
|
if is_new:
|
||||||
|
os.makedirs(link_dir)
|
||||||
|
|
||||||
|
link = load_json_link_index(link_dir, link)
|
||||||
|
log_link_archiving_started(link_dir, link, is_new)
|
||||||
|
|
||||||
for archive_method in active_methods:
|
for archive_method in active_methods:
|
||||||
archive_method(link_dir, link, overwrite=overwrite)
|
archive_method(link_dir, link, overwrite=overwrite)
|
||||||
|
|
||||||
|
|
||||||
write_link_index(link_dir, link)
|
write_link_index(link_dir, link)
|
||||||
update_main_index(link)
|
patch_links_index(link)
|
||||||
|
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
|
print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
|
||||||
|
|
||||||
return link
|
return link
|
||||||
|
|
||||||
def print_link_status_line(link_dir, link, is_new):
|
|
||||||
print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n {blue}{url}{reset}'.format(
|
|
||||||
symbol='+' if is_new else '*',
|
|
||||||
symbol_color=ANSI['green' if is_new else 'black'],
|
|
||||||
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
|
||||||
**{**link, 'title': link['title'] or link['url']},
|
|
||||||
**ANSI,
|
|
||||||
))
|
|
||||||
|
|
||||||
print(' > {}{}'.format(pretty_path(link_dir), ' (new)' if is_new else ''))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def attach_result_to_link(method):
|
def attach_result_to_link(method):
|
||||||
"""
|
"""
|
||||||
|
@ -178,15 +145,75 @@ def attach_result_to_link(method):
|
||||||
link['history'][method].append(history_entry)
|
link['history'][method].append(history_entry)
|
||||||
link['latest'][method] = result['output']
|
link['latest'][method] = result['output']
|
||||||
|
|
||||||
_RESULTS_TOTALS[history_entry['status']] += 1
|
_LAST_RUN_STATS[history_entry['status']] += 1
|
||||||
|
|
||||||
return link
|
return link
|
||||||
return timed_fetch_func
|
return timed_fetch_func
|
||||||
return decorator
|
return decorator
|
||||||
|
|
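
The hunk above only shows the tail of the attach_result_to_link decorator (the history/latest bookkeeping and the _LAST_RUN_STATS counter bump). The sketch below is a minimal, self-contained approximation of that pattern, not the actual ArchiveBox implementation; the timing fields and the stand-in RUN_STATS dict are assumptions, while the 'succeded' spelling follows the keys used in this commit.

from datetime import datetime

# Stand-in for logs._LAST_RUN_STATS; the 'succeded' spelling matches this commit's keys.
RUN_STATS = {'skipped': 0, 'succeded': 0, 'failed': 0}

def attach_result_to_link(method):
    """Decorator factory: run a fetcher, then record its result on the link dict."""
    def decorator(fetch_func):
        def timed_fetch_func(link_dir, link, **kwargs):
            start_ts = datetime.now().timestamp()
            result = fetch_func(link_dir, link, **kwargs)
            end_ts = datetime.now().timestamp()

            # Derive a status if the fetcher didn't report one explicitly (assumption).
            status = result.get('status') or ('failed' if isinstance(result.get('output'), Exception) else 'succeded')
            history_entry = {'start_ts': start_ts, 'end_ts': end_ts, 'status': status, **result}

            link.setdefault('history', {}).setdefault(method, []).append(history_entry)
            link.setdefault('latest', {})[method] = result.get('output')
            RUN_STATS[status] += 1
            return link
        return timed_fetch_func
    return decorator

@attach_result_to_link('title')
def fetch_title_stub(link_dir, link, timeout=60):
    # dummy fetcher used only to exercise the decorator
    return {'output': 'Example Title', 'status': 'succeded'}

link = {'url': 'https://example.com', 'title': None}
fetch_title_stub('/tmp/example', link)
print(link['latest']['title'], RUN_STATS['succeded'])  # Example Title 1
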
||||||
|
@attach_result_to_link('title')
|
||||||
|
def fetch_title(link_dir, link, timeout=TIMEOUT):
|
||||||
|
"""try to guess the page's title from its content"""
|
||||||
|
|
||||||
|
# if link already has valid title, skip it
|
||||||
|
if link['title'] and not link['title'].lower().startswith('http'):
|
||||||
|
return {'output': link['title'], 'status': 'skipped'}
|
||||||
|
|
||||||
|
if is_static_file(link['url']):
|
||||||
|
return {'output': None, 'status': 'skipped'}
|
||||||
|
|
||||||
|
end = progress(timeout, prefix=' ')
|
||||||
|
try:
|
||||||
|
title = fetch_page_title(link['url'], timeout=timeout, progress=False)
|
||||||
|
end()
|
||||||
|
output = title
|
||||||
|
except Exception as e:
|
||||||
|
end()
|
||||||
|
output = e
|
||||||
|
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
|
||||||
|
|
||||||
|
if title and title.strip():
|
||||||
|
link['title'] = title
|
||||||
|
output = title
|
||||||
|
|
||||||
|
return {
|
||||||
|
'cmd': 'fetch_page_title("{}")'.format(link['url']),
|
||||||
|
'output': output,
|
||||||
|
}
|
||||||
|
|
||||||
|
@attach_result_to_link('favicon')
|
||||||
|
def fetch_favicon(link_dir, link, timeout=TIMEOUT):
|
||||||
|
"""download site favicon from google's favicon api"""
|
||||||
|
|
||||||
|
output = 'favicon.ico'
|
||||||
|
if os.path.exists(os.path.join(link_dir, output)):
|
||||||
|
return {'output': output, 'status': 'skipped'}
|
||||||
|
|
||||||
|
CMD = [
|
||||||
|
CURL_BINARY,
|
||||||
|
'--max-time', str(timeout),
|
||||||
|
'--location',
|
||||||
|
'--output', output,
|
||||||
|
*(() if CHECK_SSL_VALIDITY else ('--insecure',)),
|
||||||
|
'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
|
||||||
|
]
|
||||||
|
end = progress(timeout, prefix=' ')
|
||||||
|
try:
|
||||||
|
run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
|
||||||
|
end()
|
||||||
|
chmod_file(output, cwd=link_dir)
|
||||||
|
except Exception as e:
|
||||||
|
end()
|
||||||
|
output = e
|
||||||
|
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
|
||||||
|
|
||||||
|
return {
|
||||||
|
'cmd': CMD,
|
||||||
|
'output': output,
|
||||||
|
}
|
||||||
|
|
||||||
@attach_result_to_link('wget')
|
@attach_result_to_link('wget')
|
||||||
def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC, timeout=TIMEOUT):
|
def fetch_wget(link_dir, link, timeout=TIMEOUT):
|
||||||
"""download full site using wget"""
|
"""download full site using wget"""
|
||||||
|
|
||||||
domain_dir = os.path.join(link_dir, domain(link['url']))
|
domain_dir = os.path.join(link_dir, domain(link['url']))
|
||||||
|
@ -194,7 +221,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
|
||||||
if os.path.exists(domain_dir) and existing_file:
|
if os.path.exists(domain_dir) and existing_file:
|
||||||
return {'output': existing_file, 'status': 'skipped'}
|
return {'output': existing_file, 'status': 'skipped'}
|
||||||
|
|
||||||
if warc:
|
if FETCH_WARC:
|
||||||
warc_dir = os.path.join(link_dir, 'warc')
|
warc_dir = os.path.join(link_dir, 'warc')
|
||||||
os.makedirs(warc_dir, exist_ok=True)
|
os.makedirs(warc_dir, exist_ok=True)
|
||||||
warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
|
warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
|
||||||
|
@ -213,8 +240,8 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
|
||||||
'-e', 'robots=off',
|
'-e', 'robots=off',
|
||||||
'--restrict-file-names=unix',
|
'--restrict-file-names=unix',
|
||||||
'--timeout={}'.format(timeout),
|
'--timeout={}'.format(timeout),
|
||||||
*(() if warc else ('--timestamping',)),
|
*(() if FETCH_WARC else ('--timestamping',)),
|
||||||
*(('--warc-file={}'.format(warc_path),) if warc else ()),
|
*(('--warc-file={}'.format(warc_path),) if FETCH_WARC else ()),
|
||||||
*(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
|
*(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
|
||||||
*(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
|
*(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
|
||||||
*(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
|
*(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
|
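
For readers unfamiliar with the *(... if ... else ()) idiom used throughout the wget command above, here is a minimal standalone sketch of how those conditional tuple-splats assemble an argument list; the toggle values below are placeholders, not ArchiveBox configuration.

# Flags appear in the final command only when their toggle is truthy.
FETCH_WARC = True
WGET_USER_AGENT = 'ArchiveBox/0.x'
COOKIES_FILE = None

cmd = [
    'wget',
    '--timeout=60',
    *(() if FETCH_WARC else ('--timestamping',)),
    *(('--warc-file=warc/123',) if FETCH_WARC else ()),
    *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
    *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
    'https://example.com',
]
print(cmd)  # COOKIES_FILE is None, so no --load-cookies pair is included
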
||||||
|
@ -233,7 +260,8 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
|
||||||
if line.strip()
|
if line.strip()
|
||||||
]
|
]
|
||||||
|
|
||||||
# parse out number of files downloaded from "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
|
# parse out number of files downloaded from last line of stderr:
|
||||||
|
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
|
||||||
files_downloaded = (
|
files_downloaded = (
|
||||||
int(output_tail[-1].strip().split(' ', 2)[1] or 0)
|
int(output_tail[-1].strip().split(' ', 2)[1] or 0)
|
||||||
if 'Downloaded:' in output_tail[-1]
|
if 'Downloaded:' in output_tail[-1]
|
||||||
|
@ -263,20 +291,19 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
|
||||||
'output': output,
|
'output': output,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@attach_result_to_link('pdf')
|
@attach_result_to_link('pdf')
|
||||||
def fetch_pdf(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
|
def fetch_pdf(link_dir, link, timeout=TIMEOUT):
|
||||||
"""print PDF of site to file using chrome --headless"""
|
"""print PDF of site to file using chrome --headless"""
|
||||||
|
|
||||||
if is_static_file(link['url']):
|
if is_static_file(link['url']):
|
||||||
return {'output': wget_output_path(link), 'status': 'skipped'}
|
return {'output': None, 'status': 'skipped'}
|
||||||
|
|
||||||
output = 'output.pdf'
|
output = 'output.pdf'
|
||||||
if os.path.exists(os.path.join(link_dir, output)):
|
if os.path.exists(os.path.join(link_dir, output)):
|
||||||
return {'output': output, 'status': 'skipped'}
|
return {'output': output, 'status': 'skipped'}
|
||||||
|
|
||||||
CMD = [
|
CMD = [
|
||||||
*chrome_headless(timeout=timeout, **chrome_kwargs),
|
*chrome_args(timeout=timeout),
|
||||||
'--print-to-pdf',
|
'--print-to-pdf',
|
||||||
link['url']
|
link['url']
|
||||||
]
|
]
|
||||||
|
@ -302,18 +329,18 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
|
||||||
}
|
}
|
||||||
|
|
||||||
@attach_result_to_link('screenshot')
|
@attach_result_to_link('screenshot')
|
||||||
def fetch_screenshot(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
|
def fetch_screenshot(link_dir, link, timeout=TIMEOUT):
|
||||||
"""take screenshot of site using chrome --headless"""
|
"""take screenshot of site using chrome --headless"""
|
||||||
|
|
||||||
if is_static_file(link['url']):
|
if is_static_file(link['url']):
|
||||||
return {'output': wget_output_path(link), 'status': 'skipped'}
|
return {'output': None, 'status': 'skipped'}
|
||||||
|
|
||||||
output = 'screenshot.png'
|
output = 'screenshot.png'
|
||||||
if os.path.exists(os.path.join(link_dir, output)):
|
if os.path.exists(os.path.join(link_dir, output)):
|
||||||
return {'output': output, 'status': 'skipped'}
|
return {'output': output, 'status': 'skipped'}
|
||||||
|
|
||||||
CMD = [
|
CMD = [
|
||||||
*chrome_headless(timeout=timeout, **chrome_kwargs),
|
*chrome_args(timeout=timeout),
|
||||||
'--screenshot',
|
'--screenshot',
|
||||||
link['url'],
|
link['url'],
|
||||||
]
|
]
|
||||||
|
@ -337,18 +364,19 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
|
||||||
}
|
}
|
||||||
|
|
||||||
@attach_result_to_link('dom')
|
@attach_result_to_link('dom')
|
||||||
def fetch_dom(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
|
def fetch_dom(link_dir, link, timeout=TIMEOUT):
|
||||||
"""print HTML of site to file using chrome --dump-html"""
|
"""print HTML of site to file using chrome --dump-html"""
|
||||||
|
|
||||||
if is_static_file(link['url']):
|
if is_static_file(link['url']):
|
||||||
return {'output': wget_output_path(link), 'status': 'skipped'}
|
return {'output': None, 'status': 'skipped'}
|
||||||
|
|
||||||
output = 'output.html'
|
output = 'output.html'
|
||||||
if os.path.exists(os.path.join(link_dir, output)):
|
output_path = os.path.join(link_dir, output)
|
||||||
|
if os.path.exists(output_path):
|
||||||
return {'output': output, 'status': 'skipped'}
|
return {'output': output, 'status': 'skipped'}
|
||||||
|
|
||||||
CMD = [
|
CMD = [
|
||||||
*chrome_headless(timeout=timeout, **chrome_kwargs),
|
*chrome_args(timeout=timeout),
|
||||||
'--dump-dom',
|
'--dump-dom',
|
||||||
link['url']
|
link['url']
|
||||||
]
|
]
|
||||||
|
@ -372,6 +400,116 @@ def fetch_dom(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
|
||||||
'output': output,
|
'output': output,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@attach_result_to_link('git')
|
||||||
|
def fetch_git(link_dir, link, timeout=TIMEOUT):
|
||||||
|
"""download full site using git"""
|
||||||
|
|
||||||
|
is_clonable_url = (
|
||||||
|
domain(link['url']) in GIT_DOMAINS
|
||||||
|
or extension(link['url']) == 'git'
|
||||||
|
)
|
||||||
|
if is_static_file(link['url']) or not is_clonable_url:
|
||||||
|
return {'output': None, 'status': 'skipped'}
|
||||||
|
|
||||||
|
output = 'git'
|
||||||
|
output_path = os.path.join(link_dir, 'git')
|
||||||
|
|
||||||
|
if os.path.exists(output_path):
|
||||||
|
return {'output': output, 'status': 'skipped'}
|
||||||
|
|
||||||
|
os.makedirs(output_path, exist_ok=True)
|
||||||
|
CMD = [
|
||||||
|
GIT_BINARY,
|
||||||
|
'clone',
|
||||||
|
'--mirror',
|
||||||
|
'--recursive',
|
||||||
|
*(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
|
||||||
|
without_query(without_fragment(link['url'])),
|
||||||
|
]
|
||||||
|
end = progress(timeout, prefix=' ')
|
||||||
|
try:
|
||||||
|
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
|
||||||
|
end()
|
||||||
|
|
||||||
|
if result.returncode == 128:
|
||||||
|
# ignore failed re-download when the folder already exists
|
||||||
|
pass
|
||||||
|
elif result.returncode > 0:
|
||||||
|
hints = 'got git response code {}:'.format(result.returncode)
|
||||||
|
raise ArchiveError('Failed git download', hints)
|
||||||
|
except Exception as e:
|
||||||
|
end()
|
||||||
|
output = e
|
||||||
|
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
|
||||||
|
|
||||||
|
return {
|
||||||
|
'cmd': CMD,
|
||||||
|
'output': output,
|
||||||
|
}
|
||||||
|
|
||||||
|
@attach_result_to_link('media')
|
||||||
|
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
|
||||||
|
"""Download playlists or individual video, audio, and subtitles using youtube-dl"""
|
||||||
|
|
||||||
|
output = 'media'
|
||||||
|
output_path = os.path.join(link_dir, 'media')
|
||||||
|
|
||||||
|
if os.path.exists(output_path) and not overwrite:
|
||||||
|
return {'output': output, 'status': 'skipped'}
|
||||||
|
|
||||||
|
os.makedirs(output_path, exist_ok=True)
|
||||||
|
CMD = [
|
||||||
|
YOUTUBEDL_BINARY,
|
||||||
|
'--write-description',
|
||||||
|
'--write-info-json',
|
||||||
|
'--write-annotations',
|
||||||
|
'--yes-playlist',
|
||||||
|
'--write-thumbnail',
|
||||||
|
'--no-call-home',
|
||||||
|
'--no-check-certificate',
|
||||||
|
'--user-agent',
|
||||||
|
'--all-subs',
|
||||||
|
'--extract-audio',
|
||||||
|
'--keep-video',
|
||||||
|
'--ignore-errors',
|
||||||
|
'--geo-bypass',
|
||||||
|
'--audio-format', 'mp3',
|
||||||
|
'--audio-quality', '320K',
|
||||||
|
'--embed-thumbnail',
|
||||||
|
'--add-metadata',
|
||||||
|
*(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
|
||||||
|
link['url'],
|
||||||
|
]
|
||||||
|
|
||||||
|
end = progress(timeout, prefix=' ')
|
||||||
|
try:
|
||||||
|
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
|
||||||
|
chmod_file(output, cwd=link_dir)
|
||||||
|
end()
|
||||||
|
if result.returncode:
|
||||||
|
if (b'ERROR: Unsupported URL' in result.stderr
|
||||||
|
or b'HTTP Error 404' in result.stderr
|
||||||
|
or b'HTTP Error 403' in result.stderr
|
||||||
|
or b'URL could be a direct video link' in result.stderr
|
||||||
|
or b'Unable to extract container ID' in result.stderr):
|
||||||
|
# These happen too frequently on non-media pages to warrant printing to console
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
hints = (
|
||||||
|
'got youtubedl response code {}:'.format(result.returncode),
|
||||||
|
*result.stderr.decode().split('\n'),
|
||||||
|
)
|
||||||
|
raise ArchiveError('Failed to download media', hints)
|
||||||
|
except Exception as e:
|
||||||
|
end()
|
||||||
|
output = e
|
||||||
|
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
|
||||||
|
|
||||||
|
return {
|
||||||
|
'cmd': CMD,
|
||||||
|
'output': output,
|
||||||
|
}
|
||||||
|
|
||||||
def parse_archive_dot_org_response(response):
|
def parse_archive_dot_org_response(response):
|
||||||
# Parse archive.org response headers
|
# Parse archive.org response headers
|
||||||
headers = defaultdict(list)
|
headers = defaultdict(list)
|
||||||
|
@ -445,226 +583,4 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
|
||||||
'output': output,
|
'output': output,
|
||||||
}
|
}
|
||||||
|
|
||||||
@attach_result_to_link('favicon')
|
|
||||||
def fetch_favicon(link_dir, link, timeout=TIMEOUT):
|
|
||||||
"""download site favicon from google's favicon api"""
|
|
||||||
|
|
||||||
output = 'favicon.ico'
|
|
||||||
if os.path.exists(os.path.join(link_dir, output)):
|
|
||||||
return {'output': output, 'status': 'skipped'}
|
|
||||||
|
|
||||||
CMD = [
|
|
||||||
CURL_BINARY,
|
|
||||||
'--max-time', str(timeout),
|
|
||||||
'--location',
|
|
||||||
'--output', output,
|
|
||||||
*(() if CHECK_SSL_VALIDITY else ('--insecure',)),
|
|
||||||
'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
|
|
||||||
]
|
|
||||||
end = progress(timeout, prefix=' ')
|
|
||||||
try:
|
|
||||||
run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
|
|
||||||
end()
|
|
||||||
chmod_file(output, cwd=link_dir)
|
|
||||||
except Exception as e:
|
|
||||||
end()
|
|
||||||
output = e
|
|
||||||
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
|
|
||||||
|
|
||||||
return {
|
|
||||||
'cmd': CMD,
|
|
||||||
'output': output,
|
|
||||||
}
|
|
||||||
|
|
||||||
@attach_result_to_link('title')
|
|
||||||
def fetch_title(link_dir, link, timeout=TIMEOUT):
|
|
||||||
"""try to guess the page's title from its content"""
|
|
||||||
|
|
||||||
# if link already has valid title, skip it
|
|
||||||
if link['title'] and not link['title'].lower().startswith('http'):
|
|
||||||
return {'output': link['title'], 'status': 'skipped'}
|
|
||||||
|
|
||||||
if is_static_file(link['url']):
|
|
||||||
return {'output': None, 'status': 'skipped'}
|
|
||||||
|
|
||||||
end = progress(timeout, prefix=' ')
|
|
||||||
try:
|
|
||||||
title = fetch_page_title(link['url'], timeout=timeout, progress=False)
|
|
||||||
end()
|
|
||||||
output = title
|
|
||||||
except Exception as e:
|
|
||||||
end()
|
|
||||||
output = e
|
|
||||||
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
|
|
||||||
|
|
||||||
if title and title.strip():
|
|
||||||
link['title'] = title
|
|
||||||
output = title
|
|
||||||
|
|
||||||
return {
|
|
||||||
'cmd': 'fetch_page_title("{}")'.format(link['url']),
|
|
||||||
'output': output,
|
|
||||||
}
|
|
||||||
|
|
||||||
@attach_result_to_link('media')
|
|
||||||
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
|
|
||||||
"""Download playlists or individual video, audio, and subtitles using youtube-dl"""
|
|
||||||
|
|
||||||
output = 'media'
|
|
||||||
output_path = os.path.join(link_dir, 'media')
|
|
||||||
|
|
||||||
if os.path.exists(output_path) and not overwrite:
|
|
||||||
return {'output': output, 'status': 'skipped'}
|
|
||||||
|
|
||||||
os.makedirs(output_path, exist_ok=True)
|
|
||||||
CMD = [
|
|
||||||
YOUTUBEDL_BINARY,
|
|
||||||
'--write-description',
|
|
||||||
'--write-info-json',
|
|
||||||
'--write-annotations',
|
|
||||||
'--yes-playlist',
|
|
||||||
'--write-thumbnail',
|
|
||||||
'--no-call-home',
|
|
||||||
'--no-check-certificate',
|
|
||||||
'--user-agent',
|
|
||||||
'--all-subs',
|
|
||||||
'--extract-audio',
|
|
||||||
'--keep-video',
|
|
||||||
'--ignore-errors',
|
|
||||||
'--geo-bypass',
|
|
||||||
'--audio-format', 'mp3',
|
|
||||||
'--audio-quality', '320K',
|
|
||||||
'--embed-thumbnail',
|
|
||||||
'--add-metadata',
|
|
||||||
*(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
|
|
||||||
link['url'],
|
|
||||||
]
|
|
||||||
|
|
||||||
end = progress(timeout, prefix=' ')
|
|
||||||
try:
|
|
||||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
|
|
||||||
chmod_file(output, cwd=link_dir)
|
|
||||||
end()
|
|
||||||
if result.returncode:
|
|
||||||
if (b'ERROR: Unsupported URL' in result.stderr
|
|
||||||
or b'HTTP Error 404' in result.stderr
|
|
||||||
or b'HTTP Error 403' in result.stderr
|
|
||||||
or b'URL could be a direct video link' in result.stderr
|
|
||||||
or b'Unable to extract container ID' in result.stderr):
|
|
||||||
# These happen too frequently on non-media pages to warrant printing to console
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
hints = (
|
|
||||||
'got youtubedl response code {}:'.format(result.returncode),
|
|
||||||
*result.stderr.decode().split('\n'),
|
|
||||||
)
|
|
||||||
raise ArchiveError('Failed to download media', hints)
|
|
||||||
except Exception as e:
|
|
||||||
end()
|
|
||||||
output = e
|
|
||||||
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
|
|
||||||
|
|
||||||
return {
|
|
||||||
'cmd': CMD,
|
|
||||||
'output': output,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@attach_result_to_link('git')
|
|
||||||
def fetch_git(link_dir, link, timeout=TIMEOUT):
|
|
||||||
"""download full site using git"""
|
|
||||||
|
|
||||||
url_is_clonable = (
|
|
||||||
domain(link['url']) in GIT_DOMAINS
|
|
||||||
or link['url'].endswith('.git')
|
|
||||||
)
|
|
||||||
if not url_is_clonable or is_static_file(link['url']):
|
|
||||||
return {'output': None, 'status': 'skipped'}
|
|
||||||
|
|
||||||
output = 'git'
|
|
||||||
output_path = os.path.join(link_dir, 'git')
|
|
||||||
|
|
||||||
if os.path.exists(output_path):
|
|
||||||
return {'output': output, 'status': 'skipped'}
|
|
||||||
|
|
||||||
os.makedirs(output_path, exist_ok=True)
|
|
||||||
CMD = [
|
|
||||||
GIT_BINARY,
|
|
||||||
'clone',
|
|
||||||
'--mirror',
|
|
||||||
'--recursive',
|
|
||||||
*(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
|
|
||||||
without_query(without_fragment(link['url'])),
|
|
||||||
]
|
|
||||||
end = progress(timeout, prefix=' ')
|
|
||||||
try:
|
|
||||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
|
|
||||||
end()
|
|
||||||
|
|
||||||
if result.returncode == 128:
|
|
||||||
# ignore failed re-download when the folder already exists
|
|
||||||
pass
|
|
||||||
elif result.returncode > 0:
|
|
||||||
hints = 'got git response code {}:'.format(result.returncode)
|
|
||||||
raise ArchiveError('Failed git download', hints)
|
|
||||||
except Exception as e:
|
|
||||||
end()
|
|
||||||
output = e
|
|
||||||
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
|
|
||||||
|
|
||||||
return {
|
|
||||||
'cmd': CMD,
|
|
||||||
'output': output,
|
|
||||||
}
|
|
||||||
|
|
||||||
def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR, headless=CHROME_HEADLESS, sandbox=CHROME_SANDBOX, check_ssl_validity=CHECK_SSL_VALIDITY, user_agent=CHROME_USER_AGENT, resolution=RESOLUTION, timeout=TIMEOUT):
|
|
||||||
global CACHED_USER_DATA_DIR
|
|
||||||
user_data_dir = user_data_dir or CACHED_USER_DATA_DIR
|
|
||||||
cmd_args = [binary]
|
|
||||||
|
|
||||||
if headless:
|
|
||||||
cmd_args += ('--headless',)
|
|
||||||
|
|
||||||
if not sandbox:
|
|
||||||
# don't use GPU or sandbox when running inside docker container
|
|
||||||
cmd_args += ('--no-sandbox', '--disable-gpu')
|
|
||||||
|
|
||||||
if not check_ssl_validity:
|
|
||||||
cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
|
|
||||||
|
|
||||||
if user_agent:
|
|
||||||
cmd_args += ('--user-agent={}'.format(user_agent),)
|
|
||||||
|
|
||||||
if resolution:
|
|
||||||
cmd_args += ('--window-size={}'.format(RESOLUTION),)
|
|
||||||
|
|
||||||
if timeout:
|
|
||||||
cmd_args += ('--timeout={}'.format((timeout) * 1000),)
|
|
||||||
|
|
||||||
# Find chrome user data directory
|
|
||||||
default_profile_paths = (
|
|
||||||
'~/.config/chromium',
|
|
||||||
'~/.config/google-chrome',
|
|
||||||
'~/.config/google-chrome-beta',
|
|
||||||
'~/.config/google-chrome-unstable',
|
|
||||||
'~/Library/Application Support/Chromium',
|
|
||||||
'~/Library/Application Support/Google/Chrome',
|
|
||||||
'~/Library/Application Support/Google/Chrome Canary',
|
|
||||||
'~/AppData/Local/Chromium/User Data',
|
|
||||||
'~/AppData/Local/Google/Chrome/User Data',
|
|
||||||
'~/AppData/Local/Google/Chrome SxS/User Data',
|
|
||||||
)
|
|
||||||
if user_data_dir:
|
|
||||||
cmd_args.append('--user-data-dir={}'.format(user_data_dir))
|
|
||||||
else:
|
|
||||||
for path in default_profile_paths:
|
|
||||||
full_path = os.path.expanduser(path)
|
|
||||||
if os.path.exists(full_path):
|
|
||||||
CACHED_USER_DATA_DIR = full_path
|
|
||||||
cmd_args.append('--user-data-dir={}'.format(full_path))
|
|
||||||
break
|
|
||||||
|
|
||||||
return cmd_args
|
|
||||||
|
|
||||||
|
|
||||||
CACHED_USER_DATA_DIR = CHROME_USER_DATA_DIR
|
|
||||||
|
|
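
The new fetch_pdf/fetch_screenshot/fetch_dom code calls chrome_args() from util, which is not part of this diff. Below is a purely hypothetical sketch of such a helper, reconstructed from the flags the removed chrome_headless() used to build; the real util.chrome_args() in this commit may differ in signature, defaults, and user-data-dir discovery.

def chrome_args(binary='chromium-browser', headless=True, sandbox=False,
                check_ssl_validity=True, user_agent=None,
                resolution='1440,900', timeout=60, user_data_dir=None):
    """Assemble a headless-Chrome argument list (illustrative only)."""
    cmd_args = [binary]
    if headless:
        cmd_args += ('--headless',)
    if not sandbox:
        # don't use GPU or sandbox when running inside a docker container
        cmd_args += ('--no-sandbox', '--disable-gpu')
    if not check_ssl_validity:
        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
    if user_agent:
        cmd_args += ('--user-agent={}'.format(user_agent),)
    if resolution:
        cmd_args += ('--window-size={}'.format(resolution),)
    if timeout:
        cmd_args += ('--timeout={}'.format(timeout * 1000),)
    if user_data_dir:
        cmd_args += ('--user-data-dir={}'.format(user_data_dir),)
    return cmd_args

print(chrome_args(user_agent='ArchiveBox', timeout=60))
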
|
@ -12,18 +12,24 @@ except ImportError:
|
||||||
from config import (
|
from config import (
|
||||||
OUTPUT_DIR,
|
OUTPUT_DIR,
|
||||||
TEMPLATES_DIR,
|
TEMPLATES_DIR,
|
||||||
ANSI,
|
|
||||||
GIT_SHA,
|
GIT_SHA,
|
||||||
FOOTER_INFO,
|
FOOTER_INFO,
|
||||||
)
|
)
|
||||||
from util import (
|
from util import (
|
||||||
chmod_file,
|
chmod_file,
|
||||||
derived_link_info,
|
derived_link_info,
|
||||||
pretty_path,
|
|
||||||
check_link_structure,
|
check_link_structure,
|
||||||
check_links_structure,
|
check_links_structure,
|
||||||
wget_output_path,
|
wget_output_path,
|
||||||
)
|
)
|
||||||
|
from parse import parse_links
|
||||||
|
from links import validate_links
|
||||||
|
from logs import (
|
||||||
|
log_indexing_started,
|
||||||
|
log_indexing_finished,
|
||||||
|
log_parsing_started,
|
||||||
|
log_parsing_finished,
|
||||||
|
)
|
||||||
|
|
||||||
TITLE_LOADING_MSG = 'Not yet archived...'
|
TITLE_LOADING_MSG = 'Not yet archived...'
|
||||||
|
|
||||||
|
@ -33,21 +39,40 @@ TITLE_LOADING_MSG = 'Not yet archived...'
|
||||||
def write_links_index(out_dir, links, finished=False):
|
def write_links_index(out_dir, links, finished=False):
|
||||||
"""create index.html file for a given list of links"""
|
"""create index.html file for a given list of links"""
|
||||||
|
|
||||||
|
log_indexing_started()
|
||||||
check_links_structure(links)
|
check_links_structure(links)
|
||||||
|
|
||||||
if not os.path.exists(out_dir):
|
|
||||||
os.makedirs(out_dir)
|
|
||||||
|
|
||||||
print('{green}[*] [{}] Saving main index files...{reset}'.format(
|
|
||||||
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
|
||||||
**ANSI,
|
|
||||||
))
|
|
||||||
write_json_links_index(out_dir, links)
|
write_json_links_index(out_dir, links)
|
||||||
print(' > {}/index.json'.format(pretty_path(out_dir)))
|
log_indexing_finished(out_dir, 'index.json')
|
||||||
|
|
||||||
write_html_links_index(out_dir, links, finished=finished)
|
write_html_links_index(out_dir, links, finished=finished)
|
||||||
print(' > {}/index.html'.format(pretty_path(out_dir)))
|
log_indexing_finished(out_dir, 'index.html')
|
||||||
|
|
||||||
|
def load_links_index(out_dir=OUTPUT_DIR, import_path=None):
|
||||||
|
"""parse and load existing index with any new links from import_path merged in"""
|
||||||
|
|
||||||
|
existing_links = []
|
||||||
|
if out_dir:
|
||||||
|
existing_links = parse_json_links_index(out_dir)
|
||||||
|
check_links_structure(existing_links)
|
||||||
|
|
||||||
|
new_links = []
|
||||||
|
if import_path:
|
||||||
|
# parse and validate the import file
|
||||||
|
log_parsing_started(import_path)
|
||||||
|
raw_links, parser_name = parse_links(import_path)
|
||||||
|
new_links = validate_links(raw_links)
|
||||||
|
check_links_structure(new_links)
|
||||||
|
|
||||||
|
# merge existing links in out_dir and new links
|
||||||
|
all_links = validate_links(existing_links + new_links)
|
||||||
|
check_links_structure(all_links)
|
||||||
|
num_new_links = len(all_links) - len(existing_links)
|
||||||
|
|
||||||
|
if import_path and parser_name:
|
||||||
|
log_parsing_finished(num_new_links, parser_name)
|
||||||
|
|
||||||
|
return all_links, new_links
|
||||||
|
|
||||||
def write_json_links_index(out_dir, links):
|
def write_json_links_index(out_dir, links):
|
||||||
"""write the json link index to a given path"""
|
"""write the json link index to a given path"""
|
||||||
|
@ -70,8 +95,8 @@ def write_json_links_index(out_dir, links):
|
||||||
|
|
||||||
chmod_file(path)
|
chmod_file(path)
|
||||||
|
|
||||||
def parse_json_links_index(out_dir):
|
def parse_json_links_index(out_dir=OUTPUT_DIR):
|
||||||
"""load the index in a given directory and merge it with the given link"""
|
"""parse a archive index json file and return the list of links"""
|
||||||
index_path = os.path.join(out_dir, 'index.json')
|
index_path = os.path.join(out_dir, 'index.json')
|
||||||
if os.path.exists(index_path):
|
if os.path.exists(index_path):
|
||||||
with open(index_path, 'r', encoding='utf-8') as f:
|
with open(index_path, 'r', encoding='utf-8') as f:
|
||||||
|
@ -136,31 +161,26 @@ def write_html_links_index(out_dir, links, finished=False):
|
||||||
chmod_file(path)
|
chmod_file(path)
|
||||||
|
|
||||||
|
|
||||||
def update_main_index(link):
|
def patch_links_index(link, out_dir=OUTPUT_DIR):
|
||||||
"""hack to in-place update one row's info in the generated index html"""
|
"""hack to in-place update one row's info in the generated index html"""
|
||||||
|
|
||||||
title = link['latest']['title']
|
title = link['latest']['title']
|
||||||
successful = len([entry for entry in link['latest'].values() if entry])
|
successful = len([entry for entry in link['latest'].values() if entry])
|
||||||
|
|
||||||
# Patch JSON index
|
# Patch JSON index
|
||||||
json_path = os.path.join(OUTPUT_DIR, 'index.json')
|
|
||||||
|
|
||||||
links = parse_json_links_index(OUTPUT_DIR)
|
|
||||||
|
|
||||||
changed = False
|
changed = False
|
||||||
for json_link in links:
|
json_file_links = parse_json_links_index(out_dir)
|
||||||
if json_link['url'] == link['url']:
|
for saved_link in json_file_links:
|
||||||
json_link['title'] = title
|
if saved_link['url'] == link['url']:
|
||||||
json_link['latest'] = link['latest']
|
saved_link['title'] = title
|
||||||
|
saved_link['latest'] = link['latest']
|
||||||
changed = True
|
changed = True
|
||||||
break
|
break
|
||||||
|
|
||||||
if changed:
|
if changed:
|
||||||
write_json_links_index(OUTPUT_DIR, links)
|
write_json_links_index(out_dir, json_file_links)
|
||||||
|
|
||||||
# Patch HTML index
|
# Patch HTML index
|
||||||
html_path = os.path.join(OUTPUT_DIR, 'index.html')
|
html_path = os.path.join(out_dir, 'index.html')
|
||||||
|
|
||||||
html = open(html_path, 'r').read().split('\n')
|
html = open(html_path, 'r').read().split('\n')
|
||||||
for idx, line in enumerate(html):
|
for idx, line in enumerate(html):
|
||||||
if title and ('<span data-title-for="{}"'.format(link['url']) in line):
|
if title and ('<span data-title-for="{}"'.format(link['url']) in line):
|
||||||
|
@ -172,6 +192,7 @@ def update_main_index(link):
|
||||||
with open(html_path, 'w') as f:
|
with open(html_path, 'w') as f:
|
||||||
f.write('\n'.join(html))
|
f.write('\n'.join(html))
|
||||||
|
|
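
As a toy illustration of the in-place HTML patch that patch_links_index() performs, the snippet below locates the row carrying the data-title-for marker for a URL and swaps in a freshly fetched title; the row markup and the whole-line replacement are simplifications, not the template or matching logic ArchiveBox actually uses.

url = 'https://example.com'
new_title = 'Example Domain'

html_lines = [
    '<table>',
    '<tr><td><span data-title-for="https://example.com">Not yet archived...</span></td></tr>',
    '</table>',
]

for idx, line in enumerate(html_lines):
    # find the single row that belongs to this link and rewrite it in place
    if '<span data-title-for="{}"'.format(url) in line:
        html_lines[idx] = '<tr><td><span data-title-for="{}">{}</span></td></tr>'.format(url, new_title)
        break

print('\n'.join(html_lines))
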
||||||
|
|
||||||
### Individual link index
|
### Individual link index
|
||||||
|
|
||||||
def write_link_index(out_dir, link):
|
def write_link_index(out_dir, link):
|
||||||
|
@ -202,6 +223,18 @@ def parse_json_link_index(out_dir):
|
||||||
return link_json
|
return link_json
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
def load_json_link_index(out_dir, link):
|
||||||
|
"""check for an existing link archive in the given directory,
|
||||||
|
and load+merge it into the given link dict
|
||||||
|
"""
|
||||||
|
link = {
|
||||||
|
**parse_json_link_index(out_dir),
|
||||||
|
**link,
|
||||||
|
}
|
||||||
|
|
||||||
|
check_link_structure(link)
|
||||||
|
return link
|
||||||
|
|
||||||
def write_html_link_index(out_dir, link):
|
def write_html_link_index(out_dir, link):
|
||||||
check_link_structure(link)
|
check_link_structure(link)
|
||||||
with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
|
with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
|
||||||
|
@ -224,7 +257,10 @@ def write_html_link_index(out_dir, link):
|
||||||
wget_output_path(link)
|
wget_output_path(link)
|
||||||
or (link['domain'] if link['is_archived'] else 'about:blank')
|
or (link['domain'] if link['is_archived'] else 'about:blank')
|
||||||
),
|
),
|
||||||
'extension': link['extension'] or 'HTML',
|
'extension': link['extension'] or 'html',
|
||||||
|
'tags': link['tags'].strip() or 'untagged',
|
||||||
|
'status': 'Archived' if link['is_archived'] else 'Not yet archived',
|
||||||
|
'status_color': 'success' if link['is_archived'] else 'danger',
|
||||||
}))
|
}))
|
||||||
|
|
||||||
chmod_file(path)
|
chmod_file(path)
|
||||||
|
|
archivebox/logs.py (new file, 161 lines)
|
@ -0,0 +1,161 @@
|
||||||
|
import sys
|
||||||
|
from datetime import datetime
|
||||||
|
from config import ANSI, REPO_DIR, OUTPUT_DIR
|
||||||
|
|
||||||
|
|
||||||
|
# globals are bad, mmkay
|
||||||
|
_LAST_RUN_STATS = {
|
||||||
|
'skipped': 0,
|
||||||
|
'succeded': 0,
|
||||||
|
'failed': 0,
|
||||||
|
|
||||||
|
'parsing_start_ts': 0,
|
||||||
|
'parsing_end_ts': 0,
|
||||||
|
|
||||||
|
'indexing_start_ts': 0,
|
||||||
|
'indexing_end_ts': 0,
|
||||||
|
|
||||||
|
'archiving_start_ts': 0,
|
||||||
|
'archiving_end_ts': 0,
|
||||||
|
|
||||||
|
'links': {},
|
||||||
|
}
|
||||||
|
|
||||||
|
def pretty_path(path):
|
||||||
|
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
|
||||||
|
return path.replace(REPO_DIR + '/', '')
|
||||||
|
|
||||||
|
|
||||||
|
def log_link_archiving_started(link_dir, link, is_new):
|
||||||
|
print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n {blue}{url}{reset}'.format(
|
||||||
|
symbol='+' if is_new else '*',
|
||||||
|
symbol_color=ANSI['green' if is_new else 'black'],
|
||||||
|
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
||||||
|
**{**link, 'title': link['title'] or link['url']},
|
||||||
|
**ANSI,
|
||||||
|
))
|
||||||
|
|
||||||
|
print(' > {}{}'.format(pretty_path(link_dir), ' (new)' if is_new else ''))
|
||||||
|
|
||||||
|
|
||||||
|
def log_link_archiving_failed(cmd, pwd, err=None, hints=None, prefix=' '):
|
||||||
|
"""quote the argument with whitespace in a command so the user can
|
||||||
|
copy-paste the outputted string directly to run the cmd
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Prettify CMD string and make it safe to copy-paste by quoting arguments
|
||||||
|
quoted_cmd = ' '.join(
|
||||||
|
'"{}"'.format(arg) if ' ' in arg else arg
|
||||||
|
for arg in cmd
|
||||||
|
)
|
||||||
|
|
||||||
|
# Prettify error output hints string and limit to five lines
|
||||||
|
hints = hints or getattr(err, 'hints', None)
|
||||||
|
if hints:
|
||||||
|
hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
|
||||||
|
hints = (
|
||||||
|
' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
|
||||||
|
for line in hints[:5] if line.strip()
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
hints = ()
|
||||||
|
|
||||||
|
output_lines = [
|
||||||
|
'{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']),
|
||||||
|
*hints,
|
||||||
|
'Run to see full output:',
|
||||||
|
' cd {};'.format(pwd),
|
||||||
|
' {}'.format(quoted_cmd),
|
||||||
|
]
|
||||||
|
|
||||||
|
return '\n'.join(
|
||||||
|
'{}{}'.format(prefix, line)
|
||||||
|
for line in output_lines
|
||||||
|
if line
|
||||||
|
)
|
||||||
|
|
||||||
|
### Logging Helpers
|
||||||
|
|
||||||
|
def log_parsing_started(source_file):
|
||||||
|
start_ts = datetime.now()
|
||||||
|
_LAST_RUN_STATS['parse_start_ts'] = start_ts
|
||||||
|
print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
|
||||||
|
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||||
|
source_file.rsplit('/', 1)[-1],
|
||||||
|
**ANSI,
|
||||||
|
))
|
||||||
|
|
||||||
|
def log_parsing_finished(num_new_links, parser_name):
|
||||||
|
print(' > Adding {} new links to index (parsed import as {})'.format(
|
||||||
|
num_new_links,
|
||||||
|
parser_name,
|
||||||
|
))
|
||||||
|
|
||||||
|
def log_indexing_started():
|
||||||
|
start_ts = datetime.now()
|
||||||
|
_LAST_RUN_STATS['index_start_ts'] = start_ts
|
||||||
|
print('{green}[*] [{}] Saving main index files...{reset}'.format(
|
||||||
|
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||||
|
**ANSI,
|
||||||
|
))
|
||||||
|
|
||||||
|
def log_indexing_finished(out_dir, out_file):
|
||||||
|
end_ts = datetime.now()
|
||||||
|
_LAST_RUN_STATS['index_end_ts'] = end_ts
|
||||||
|
print(' > {}/{}'.format(pretty_path(out_dir), out_file))
|
||||||
|
|
||||||
|
def log_archiving_started(num_links, resume):
|
||||||
|
start_ts = datetime.now()
|
||||||
|
_LAST_RUN_STATS['start_ts'] = start_ts
|
||||||
|
if resume:
|
||||||
|
print('{green}[▶] [{}] Resuming archive updating for {} pages starting from {}...{reset}'.format(
|
||||||
|
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||||
|
num_links,
|
||||||
|
resume,
|
||||||
|
**ANSI,
|
||||||
|
))
|
||||||
|
else:
|
||||||
|
print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
|
||||||
|
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||||
|
num_links,
|
||||||
|
**ANSI,
|
||||||
|
))
|
||||||
|
|
||||||
|
def log_archiving_paused(num_links, idx, timestamp):
|
||||||
|
end_ts = datetime.now()
|
||||||
|
_LAST_RUN_STATS['end_ts'] = end_ts
|
||||||
|
print()
|
||||||
|
print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
|
||||||
|
**ANSI,
|
||||||
|
now=end_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||||
|
idx=idx+1,
|
||||||
|
timestamp=timestamp,
|
||||||
|
total=num_links,
|
||||||
|
))
|
||||||
|
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
|
||||||
|
print(' Continue where you left off by running:')
|
||||||
|
print(' {} {}'.format(
|
||||||
|
pretty_path(sys.argv[0]),
|
||||||
|
timestamp,
|
||||||
|
))
|
||||||
|
|
||||||
|
def log_archiving_finished(num_links):
|
||||||
|
end_ts = datetime.now()
|
||||||
|
_LAST_RUN_STATS['end_ts'] = end_ts
|
||||||
|
seconds = end_ts.timestamp() - _LAST_RUN_STATS['start_ts'].timestamp()
|
||||||
|
if seconds > 60:
|
||||||
|
duration = '{0:.2f} min'.format(seconds / 60, 2)
|
||||||
|
else:
|
||||||
|
duration = '{0:.2f} sec'.format(seconds, 2)
|
||||||
|
|
||||||
|
print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
|
||||||
|
ANSI['green'],
|
||||||
|
end_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||||
|
num_links,
|
||||||
|
duration,
|
||||||
|
ANSI['reset'],
|
||||||
|
))
|
||||||
|
print(' - {} entries skipped'.format(_LAST_RUN_STATS['skipped']))
|
||||||
|
print(' - {} entries updated'.format(_LAST_RUN_STATS['succeded']))
|
||||||
|
print(' - {} errors'.format(_LAST_RUN_STATS['failed']))
|
||||||
|
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
|
|
@ -1,17 +1,19 @@
|
||||||
# coding: utf-8
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Everything related to parsing links from bookmark services.
|
Everything related to parsing links from input sources.
|
||||||
|
|
||||||
For a list of supported services, see the README.md.
|
For a list of supported services, see the README.md.
|
||||||
For examples of supported files see examples/.
|
For examples of supported import formats see tests/.
|
||||||
|
|
||||||
Parsed link schema: {
|
Link: {
|
||||||
'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
|
'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
|
||||||
'timestamp': '15442123124234',
|
'timestamp': '1544212312.4234',
|
||||||
'title': 'Example.com Page Title',
|
'title': 'Example.com Page Title',
|
||||||
'sources': ['ril_export.html', 'downloads/getpocket.com.txt'],
|
|
||||||
'tags': 'abc,def',
|
'tags': 'abc,def',
|
||||||
|
'sources': [
|
||||||
|
'output/sources/ril_export.html',
|
||||||
|
'output/sources/getpocket.com-1523422111.txt',
|
||||||
|
'output/sources/stdin-234234112312.txt'
|
||||||
|
]
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|
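
To make the new Link schema above concrete, here is a sample link dict matching it, plus a naive structural check; the looks_like_link() helper is only an illustration and is not the check_link_structure() used elsewhere in the codebase.

example_link = {
    'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
    'timestamp': '1544212312.4234',
    'title': 'Example.com Page Title',
    'tags': 'abc,def',
    'sources': [
        'output/sources/ril_export.html',
        'output/sources/getpocket.com-1523422111.txt',
    ],
}

def looks_like_link(link):
    """Rough structural check: required keys present and sources is a list."""
    required = ('url', 'timestamp', 'title', 'tags', 'sources')
    return (
        isinstance(link, dict)
        and all(key in link for key in required)
        and isinstance(link['sources'], list)
    )

assert looks_like_link(example_link)
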
||||||
|
@ -19,45 +21,59 @@ import re
 import json

 from datetime import datetime
-from collections import OrderedDict
 import xml.etree.ElementTree as etree

-from config import ANSI
+from config import TIMEOUT
 from util import (
     str_between,
     URL_REGEX,
-    check_url_parsing,
+    check_url_parsing_invariants,
+    progress,
 )


-def parse_links(path):
-    """parse a list of links dictionaries from a bookmark export file"""
-
-    check_url_parsing()
-    links = []
-    with open(path, 'r', encoding='utf-8') as file:
-        print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
-            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-            path.rsplit('/', 1)[-1],
-            **ANSI,
-        ))
-
-        for parser_name, parser_func in PARSERS.items():
+def parse_links(source_file):
+    """parse a list of URLs with their metadata from an
+       RSS feed, bookmarks export, or text file
+    """
+
+    check_url_parsing_invariants()
+    PARSERS = (
+        # Specialized parsers
+        ('Pocket HTML', parse_pocket_html_export),
+        ('Pinboard RSS', parse_pinboard_rss_export),
+        ('Shaarli RSS', parse_shaarli_rss_export),
+        ('Medium RSS', parse_medium_rss_export),
+
+        # General parsers
+        ('Netscape HTML', parse_netscape_html_export),
+        ('Generic RSS', parse_rss_export),
+        ('Generic JSON', parse_json_export),
+
+        # Fallback parser
+        ('Plain Text', parse_plain_text_export),
+    )
+    end = progress(TIMEOUT * 4, prefix='    ')
+    with open(source_file, 'r', encoding='utf-8') as file:
+        for parser_name, parser_func in PARSERS:
             try:
-                links += list(parser_func(file))
+                links = list(parser_func(file))
                 if links:
-                    break
+                    end()
+                    return links, parser_name
             except Exception as err:
-                # we try each parser one by one, wong parsers will throw exeptions
-                # if unsupported and we accept the first one that passes
-                # uncomment the following line to see why the parser was unsupported for each attempted format
+                # Parsers are tried one by one down the list, and the first one
+                # that succeeds is used. To see why a certain parser was not used
+                # due to error or format incompatibility, uncomment this line:
                 # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
                 pass

-    return links, parser_name
+    end()
+    return [], 'Plain Text'


+### Import Parser Functions

 def parse_pocket_html_export(html_file):
     """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""

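Note: a minimal usage sketch of the parser-fallback chain added above; the source file path is made up for illustration, but the call signature (returning a (links, parser_name) tuple) matches the new code:

    # Hypothetical call site; 'output/sources/bookmarks_export.html' is a made-up path.
    links, parser_name = parse_links('output/sources/bookmarks_export.html')
    print('Parsed {} links using the {} parser'.format(len(links), parser_name))
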
@ -81,40 +97,57 @@ def parse_pocket_html_export(html_file):
             'sources': [html_file.name],
         }

-def parse_pinboard_json_export(json_file):
+def parse_json_export(json_file):
     """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""

     json_file.seek(0)
-    json_content = json.load(json_file)
-    for line in json_content:
+    links = json.load(json_file)
+    json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
+
+    for link in links:
         # example line
         # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
-        if line:
-            erg = line
-            if erg.get('timestamp'):
-                timestamp = str(erg['timestamp']/10000000)   # chrome/ff histories use a very precise timestamp
-            elif erg.get('time'):
-                timestamp = str(datetime.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ').timestamp())
-            elif erg.get('created_at'):
-                timestamp = str(datetime.strptime(erg['created_at'], '%Y-%m-%dT%H:%M:%S%z').timestamp())
-            else:
-                timestamp = str(datetime.now().timestamp())
-            if erg.get('href'):
-                url = erg['href']
-            else:
-                url = erg['url']
-            if erg.get('description'):
-                title = (erg.get('description') or '').replace(' — Readability', '')
-            else:
-                title = erg['title'].strip()
-
-            info = {
+        if link:
+            # Parse URL
+            url = link.get('href') or link.get('url') or link.get('URL')
+            if not url:
+                raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
+
+            # Parse the timestamp
+            ts_str = str(datetime.now().timestamp())
+            if link.get('timestamp'):
+                # chrome/ff histories use a very precise timestamp
+                ts_str = str(link['timestamp'] / 10000000)
+            elif link.get('time'):
+                ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
+            elif link.get('created_at'):
+                ts_str = str(json_date(link['created_at']).timestamp())
+            elif link.get('created'):
+                ts_str = str(json_date(link['created']).timestamp())
+            elif link.get('date'):
+                ts_str = str(json_date(link['date']).timestamp())
+            elif link.get('bookmarked'):
+                ts_str = str(json_date(link['bookmarked']).timestamp())
+            elif link.get('saved'):
+                ts_str = str(json_date(link['saved']).timestamp())
+
+            # Parse the title
+            title = None
+            if link.get('title'):
+                title = link['title'].strip() or None
+            elif link.get('description'):
+                title = link['description'].replace(' — Readability', '').strip() or None
+            elif link.get('name'):
+                title = link['name'].strip() or None
+
+            yield {
                 'url': url,
-                'timestamp': timestamp,
-                'title': title or None,
-                'tags': erg.get('tags') or '',
+                'timestamp': ts_str,
+                'title': title,
+                'tags': link.get('tags') or '',
                 'sources': [json_file.name],
             }
-            yield info


 def parse_rss_export(rss_file):

@ -139,15 +172,15 @@ def parse_rss_export(rss_file):
         def get_row(key):
             return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]

-        title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
         url = str_between(get_row('link'), '<link>', '</link>')
         ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
         time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
+        title = str_between(get_row('title'), '<![CDATA[', ']]').strip() or None

         yield {
             'url': url,
             'timestamp': str(time.timestamp()),
-            'title': title or None,
+            'title': title,
             'tags': '',
             'sources': [rss_file.name],
         }

@ -224,9 +257,6 @@ def parse_pinboard_rss_export(rss_file):
         tags = item.find("{http://purl.org/dc/elements/1.1/}subject").text if item.find("{http://purl.org/dc/elements/1.1/}subject") else None
         title = item.find("{http://purl.org/rss/1.0/}title").text.strip() if item.find("{http://purl.org/rss/1.0/}title").text.strip() else None
         ts_str = item.find("{http://purl.org/dc/elements/1.1/}date").text if item.find("{http://purl.org/dc/elements/1.1/}date").text else None
-        # = 🌈🌈🌈🌈
-        # = 🌈🌈🌈🌈
-        # = 🏆🏆🏆🏆

         # Pinboard includes a colon in its date stamp timezone offsets, which
         # Python can't parse. Remove it:

@ -254,8 +284,6 @@ def parse_medium_rss_export(rss_file):
     root = etree.parse(rss_file).getroot()
     items = root.find("channel").findall("item")
     for item in items:
-        # for child in item:
-        #     print(child.tag, child.text)
         url = item.find("link").text
         title = item.find("title").text.strip()
         ts_str = item.find("pubDate").text

|
@ -274,31 +302,13 @@ def parse_plain_text_export(text_file):
|
||||||
"""Parse raw links from each line in a text file"""
|
"""Parse raw links from each line in a text file"""
|
||||||
|
|
||||||
text_file.seek(0)
|
text_file.seek(0)
|
||||||
text_content = text_file.readlines()
|
for line in text_file.readlines():
|
||||||
for line in text_content:
|
urls = re.findall(URL_REGEX, line) if line.strip() else ()
|
||||||
if line:
|
for url in urls:
|
||||||
urls = re.findall(URL_REGEX, line)
|
yield {
|
||||||
|
'url': url,
|
||||||
for url in urls:
|
'timestamp': str(datetime.now().timestamp()),
|
||||||
url = url.strip()
|
'title': None,
|
||||||
time = datetime.now()
|
'tags': '',
|
||||||
|
'sources': [text_file.name],
|
||||||
yield {
|
}
|
||||||
'url': url,
|
|
||||||
'timestamp': str(time.timestamp()),
|
|
||||||
'title': None,
|
|
||||||
'tags': '',
|
|
||||||
'sources': [text_file.name],
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
PARSERS = OrderedDict([
|
|
||||||
('Pocket HTML', parse_pocket_html_export),
|
|
||||||
('Pinboard JSON', parse_pinboard_json_export),
|
|
||||||
('Netscape HTML', parse_netscape_html_export),
|
|
||||||
('RSS', parse_rss_export),
|
|
||||||
('Pinboard RSS', parse_pinboard_rss_export),
|
|
||||||
('Shaarli RSS', parse_shaarli_rss_export),
|
|
||||||
('Medium RSS', parse_medium_rss_export),
|
|
||||||
('Plain Text', parse_plain_text_export),
|
|
||||||
])
|
|
||||||
|
|
|
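Note: a small sketch of what the simplified plain-text parser above does with a line of free text; URL_REGEX comes from util.py and the sample line is invented:

    import re
    from util import URL_REGEX

    # Invented sample line; each match found by URL_REGEX becomes one yielded link dict.
    line = 'see https://example.com/a and http://example.org/b for details'
    for url in re.findall(URL_REGEX, line):
        print(url)
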
@ -1,10 +1,64 @@
+"""
+Patches, additions, and shortcuts for Python standard library functions.
+"""
+
+### subprocess
+
+from subprocess import (
+    Popen,
+    PIPE,
+    DEVNULL,
+    CompletedProcess,
+    TimeoutExpired,
+    CalledProcessError,
+)
+
+def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
+    """Patched version of subprocess.run to fix blocking io making timeout= ineffective"""
+
+    if input is not None:
+        if 'stdin' in kwargs:
+            raise ValueError('stdin and input arguments may not both be used.')
+        kwargs['stdin'] = PIPE
+
+    if capture_output:
+        if ('stdout' in kwargs) or ('stderr' in kwargs):
+            raise ValueError('stdout and stderr arguments may not be used '
+                             'with capture_output.')
+        kwargs['stdout'] = PIPE
+        kwargs['stderr'] = PIPE
+
+    with Popen(*popenargs, **kwargs) as process:
+        try:
+            stdout, stderr = process.communicate(input, timeout=timeout)
+        except TimeoutExpired:
+            process.kill()
+            try:
+                stdout, stderr = process.communicate(input, timeout=2)
+            except:
+                pass
+            raise TimeoutExpired(popenargs[0][0], timeout)
+        except BaseException as err:
+            process.kill()
+            # We don't call process.wait() as .__exit__ does that for us.
+            raise
+        retcode = process.poll()
+        if check and retcode:
+            raise CalledProcessError(retcode, process.args,
+                                     output=stdout, stderr=stderr)
+        return CompletedProcess(process.args, retcode, stdout, stderr)
+
+
+### collections
+
 from sys import maxsize
 from itertools import islice
 from collections import deque

 _marker = object()

-class Peekable(object):
+class PeekableGenerator:
     """Peekable version of a normal python generator.
     Useful when you don't want to evaluate the entire iterable to look at
     a specific item at a given idx.

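Note: a minimal sketch of calling the patched run() that now lives in stdlib_patches.py; the command and timeout value are placeholders, but it behaves like subprocess.run except the child is killed promptly when the timeout expires instead of blocking on pipe reads:

    from stdlib_patches import run, PIPE, DEVNULL

    # Placeholder command for illustration.
    result = run(['echo', 'hello'], stdout=PIPE, stderr=DEVNULL, timeout=5)
    print(result.returncode, result.stdout)
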
@ -74,8 +128,6 @@ class Peekable(object):

         return next(self._it)

-    next = __next__  # For Python 2 compatibility
-
     def _get_slice(self, index):
         # Normalize the slice's arguments
         step = 1 if (index.step is None) else index.step

@ -192,22 +192,27 @@
             Bookmarked: <small title="Timestamp: $timestamp">$bookmarked_date</small>
             |
             Last updated: <small title="Timestamp: $updated">$updated_date</small>
+            |
+            Total files: <small title="Archive methods">🗃 $num_outputs</small>
         </div>
         <div class="col-lg-4 alert well">
             Type:
             <span class="badge badge-default">$extension</span>
             |
             Tags:
-            <span class="badge badge-success">$tags</span>
+            <span class="badge badge-warning">$tags</span>
+            |
+            Status:
+            <span class="badge badge-$status_color">$status</span>
         </div>
         <div class="col-lg-4 alert well">
-            Download:
+            Archive Methods:
             <a href="index.json" title="JSON summary of archived link.">JSON</a> |
             <a href="warc/" title="Any WARC archives for the page">WARC</a> |
             <a href="media/" title="Audio, Video, and Subtitle files.">Media</a> |
             <a href="git/" title="Any git repos at the url">Git Repos</a> |
             <a href="favicon.ico" title="Any git repos at the url">Favicon</a> |
-            <a href="." title="Webserver-provided index of files directory.">More files...</a>
+            <a href="." title="Webserver-provided index of files directory.">See all files...</a>
         </div>
         <hr/>
         <div class="col-lg-2">

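Note: the $-style placeholders in the template above ($timestamp, $tags, $status_color, etc.) suggest Python string substitution. A minimal sketch of how one fragment might be rendered, assuming string.Template-style substitution (the rendering call itself is an assumption; the placeholder names are copied from the template):

    from string import Template

    # Hypothetical rendering of one fragment of the template above.
    fragment = Template('<span class="badge badge-$status_color">$status</span>')
    print(fragment.substitute(status_color='success', status='archived'))
    # -> <span class="badge badge-success">archived</span>
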
@ -8,8 +8,8 @@ from urllib.parse import urlparse, quote
 from decimal import Decimal
 from datetime import datetime
 from multiprocessing import Process
-from subprocess import TimeoutExpired, Popen, PIPE, DEVNULL, CompletedProcess, CalledProcessError

+from stdlib_patches import run, PIPE, DEVNULL
 from config import (
     ANSI,
     TERM_WIDTH,
@ -19,8 +19,6 @@ from config import (
     OUTPUT_PERMISSIONS,
     TIMEOUT,
     SHOW_PROGRESS,
-    CHECK_SSL_VALIDITY,
-    WGET_USER_AGENT,
     CURL_BINARY,
     WGET_BINARY,
     CHROME_BINARY,
@ -37,6 +35,13 @@ from config import (
     FETCH_MEDIA,
     SUBMIT_ARCHIVE_DOT_ORG,
     ARCHIVE_DIR_NAME,
+    RESOLUTION,
+    CHECK_SSL_VALIDITY,
+    WGET_USER_AGENT,
+    CHROME_USER_AGENT,
+    CHROME_USER_DATA_DIR,
+    CHROME_HEADLESS,
+    CHROME_SANDBOX,
 )

 ### Parsing Helpers

@ -56,6 +61,7 @@ extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basen
 base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links

 short_ts = lambda ts: ts.split('.')[0]
+urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')

 URL_REGEX = re.compile(
     r'http[s]?://'          # start matching from allowed schemes

@ -109,66 +115,74 @@ def check_links_structure(links):
 def check_dependencies():
     """Check that all necessary dependencies are installed, and have valid versions"""

-    python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
-    if python_vers < 3.5:
-        print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
-        print('    See https://github.com/pirate/ArchiveBox#troubleshooting for help upgrading your Python installation.')
-        raise SystemExit(1)
-
-    if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
-        if run(['which', CURL_BINARY], stdout=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL).returncode:
-            print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(CURL_BINARY))
-            print('    See https://github.com/pirate/ArchiveBox for help.')
-            raise SystemExit(1)
-
-    if FETCH_WGET or FETCH_WARC:
-        if run(['which', WGET_BINARY], stdout=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL).returncode:
-            print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(WGET_BINARY))
-            print('    See https://github.com/pirate/ArchiveBox for help.')
-            raise SystemExit(1)
-
-    if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
-        if run(['which', CHROME_BINARY], stdout=DEVNULL).returncode:
-            print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
-            print('    See https://github.com/pirate/ArchiveBox for help.')
-            raise SystemExit(1)
-
-        # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
-        try:
-            result = run([CHROME_BINARY, '--version'], stdout=PIPE)
-            version_str = result.stdout.decode('utf-8')
-            version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
-            version = [l for l in version_lines if l.isdigit()][-1]
-            if int(version) < 59:
-                print(version_lines)
-                print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
-                print('    See https://github.com/pirate/ArchiveBox for help.')
-                raise SystemExit(1)
-        except (IndexError, TypeError, OSError):
-            print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
-            print('    See https://github.com/pirate/ArchiveBox for help.')
-            raise SystemExit(1)
-
-    if FETCH_GIT:
-        if run(['which', GIT_BINARY], stdout=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL).returncode:
-            print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(GIT_BINARY))
-            print('    See https://github.com/pirate/ArchiveBox for help.')
-            raise SystemExit(1)
-
-    if FETCH_MEDIA:
-        if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL).returncode:
-            print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
-            print('    See https://github.com/pirate/ArchiveBox for help.')
-            raise SystemExit(1)
-
-
-def check_url_parsing():
+    try:
+        python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
+        if python_vers < 3.5:
+            print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
+            print('    See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
+            raise SystemExit(1)
+
+        if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
+            if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
+                print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
+                print('    Install it, then confirm it works with: {} --version'.format(CURL_BINARY))
+                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+                raise SystemExit(1)
+
+        if FETCH_WGET or FETCH_WARC:
+            if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
+                print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
+                print('    Install it, then confirm it works with: {} --version'.format(WGET_BINARY))
+                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+                raise SystemExit(1)
+
+        if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
+            if run(['which', CHROME_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode:
+                print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
+                print('    Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
+                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+                raise SystemExit(1)
+
+            # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
+            try:
+                result = run([CHROME_BINARY, '--version'], stdout=PIPE)
+                version_str = result.stdout.decode('utf-8')
+                version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
+                version = [l for l in version_lines if l.isdigit()][-1]
+                if int(version) < 59:
+                    print(version_lines)
+                    print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
+                    print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+                    raise SystemExit(1)
+            except (IndexError, TypeError, OSError):
+                print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
+                print('    Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
+                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+                raise SystemExit(1)
+
+        if FETCH_GIT:
+            if run(['which', GIT_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
+                print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
+                print('    Install it, then confirm it works with: {} --version'.format(GIT_BINARY))
+                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+                raise SystemExit(1)
+
+        if FETCH_MEDIA:
+            if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
+                print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
+                print('    Install it, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
+                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+                raise SystemExit(1)
+    except (KeyboardInterrupt, Exception):
+        raise SystemExit(1)
+
+
+def check_url_parsing_invariants():
     """Check that plain text regex URL parsing works as expected"""

+    # this is last-line-of-defense to make sure the URL_REGEX isn't
+    # misbehaving, as the consequences could be disastrous and lead to many
+    # incorrect/badly parsed links being added to the archive
+
     test_urls = '''
     https://example1.com/what/is/happening.html?what=1#how-about-this=1
     https://example2.com/what/is/happening/?what=1#how-about-this=1

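Note: the body of check_url_parsing_invariants() is truncated in this hunk. A minimal sketch of the kind of invariant such a check might assert over the test_urls block above; the expected count and the assert message are invented for illustration:

    import re
    from util import URL_REGEX

    def sketch_check_url_parsing_invariants():
        # Hypothetical: every line of the known-good block should yield exactly one match.
        test_urls = '''
        https://example1.com/what/is/happening.html?what=1#how-about-this=1
        https://example2.com/what/is/happening/?what=1#how-about-this=1
        '''
        urls = re.findall(URL_REGEX, test_urls)
        assert len(urls) == 2, 'URL_REGEX failed to match the known-good test URLs'
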
@ -276,22 +290,9 @@ def wget_output_path(link):
     if link.get('latest', {}).get('wget'):
         return link['latest']['wget']

-    urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
-
     if is_static_file(link['url']):
         return urlencode(without_scheme(without_fragment(link['url'])))

-    # Since the wget algorithm to for -E (appending .html) is incredibly complex
-    # instead of trying to emulate it here, we just look in the output folder
-    # to see what html file wget actually created as the output
-    link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
-    full_path = without_fragment(without_query(path(link['url']))).strip('/')
-    search_dir = os.path.join(
-        link_dir,
-        domain(link['url']),
-        full_path,
-    )
-
     # Wget downloads can save in a number of different ways depending on the url
     #    https://example.com
     #    > output/archive/<timestamp>/example.com/index.html
@ -304,6 +305,19 @@ def wget_output_path(link):
     # There's also lots of complexity around how the urlencoding and renaming
     # is done for pages with query and hash fragments or extensions like shtml / htm

+    # Since the wget algorithm for -E (appending .html) is incredibly complex
+    # and there's no way to get the computed output path from wget,
+    # instead of trying to reverse-engineer how they calculate it,
+    # we just look in the output folder and read the filename wget used from the filesystem
+    link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
+    full_path = without_fragment(without_query(path(link['url']))).strip('/')
+    search_dir = os.path.join(
+        link_dir,
+        domain(link['url']),
+        full_path,
+    )
+
     for _ in range(4):
         if os.path.exists(search_dir):
             if os.path.isdir(search_dir):

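Note: a small worked example of the search_dir computed by the block above; the timestamp, URL, and hard-coded domain/path below are invented, and the domain()/path() helpers are approximated rather than imported:

    import os

    # Invented example values:
    ARCHIVE_DIR = 'output/archive'
    link = {'timestamp': '1544212312.4234', 'url': 'https://example.com/path/page?id=1'}

    # Mirrors the added lines above, with the URL helpers approximated by literals:
    link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
    search_dir = os.path.join(link_dir, 'example.com', 'path/page')
    print(search_dir)  # output/archive/1544212312.4234/example.com/path/page
    # wget's actual output file (page.html, index.html, ...) is then looked up
    # inside that directory instead of being re-derived from wget's -E rules.
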
@ -356,47 +370,6 @@ def str_between(string, start, end=None):

     return content

-def pretty_path(path):
-    """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
-    return path.replace(REPO_DIR + '/', '')
-
-
-def print_error_hints(cmd, pwd, err=None, hints=None, prefix='    '):
-    """quote the argument with whitespace in a command so the user can
-       copy-paste the outputted string directly to run the cmd
-    """
-
-    # Prettify CMD string and make it save to copy-paste by quoting arguments
-    quoted_cmd = ' '.join(
-        '"{}"'.format(arg) if ' ' in arg else arg
-        for arg in cmd
-    )
-
-    # Prettify error output hints string and limit to five lines
-    hints = hints or getattr(err, 'hints', None)
-    if hints:
-        hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
-        hints = (
-            '    {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
-            for line in hints[:5] if line.strip()
-        )
-    else:
-        hints = ()
-
-    output_lines = [
-        '{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']),
-        *hints,
-        'Run to see full output:'
-        '    cd {};'.format(pwd),
-        '    {}'.format(quoted_cmd),
-    ]
-
-    return '\n'.join(
-        '{}{}'.format(prefix, line)
-        for line in output_lines
-        if line
-    )
-
-
 ### Link Helpers

@ -571,37 +544,59 @@ def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
         print('     ', chmod_result.stderr.decode())
         raise Exception('Failed to chmod {}/{}'.format(cwd, path))

-def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
-    """Patched of subprocess.run to fix blocking io making timeout=innefective"""
-
-    if input is not None:
-        if 'stdin' in kwargs:
-            raise ValueError('stdin and input arguments may not both be used.')
-        kwargs['stdin'] = PIPE
-
-    if capture_output:
-        if ('stdout' in kwargs) or ('stderr' in kwargs):
-            raise ValueError('stdout and stderr arguments may not be used '
-                             'with capture_output.')
-        kwargs['stdout'] = PIPE
-        kwargs['stderr'] = PIPE
-
-    with Popen(*popenargs, **kwargs) as process:
-        try:
-            stdout, stderr = process.communicate(input, timeout=timeout)
-        except TimeoutExpired:
-            process.kill()
-            try:
-                stdout, stderr = process.communicate(input, timeout=2)
-            except:
-                pass
-            raise TimeoutExpired(popenargs[0][0], timeout)
-        except BaseException as err:
-            process.kill()
-            # We don't call process.wait() as .__exit__ does that for us.
-            raise
-        retcode = process.poll()
-        if check and retcode:
-            raise CalledProcessError(retcode, process.args,
-                                     output=stdout, stderr=stderr)
-        return CompletedProcess(process.args, retcode, stdout, stderr)
+CACHED_USER_DATA_DIR = CHROME_USER_DATA_DIR
+
+def chrome_args(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR,
+                headless=CHROME_HEADLESS, sandbox=CHROME_SANDBOX,
+                check_ssl_validity=CHECK_SSL_VALIDITY, user_agent=CHROME_USER_AGENT,
+                resolution=RESOLUTION, timeout=TIMEOUT):
+    """helper to build up a chrome shell command with arguments"""
+
+    global CACHED_USER_DATA_DIR
+    user_data_dir = user_data_dir or CACHED_USER_DATA_DIR
+    cmd_args = [binary]
+
+    if headless:
+        cmd_args += ('--headless',)
+
+    if not sandbox:
+        # dont use GPU or sandbox when running inside docker container
+        cmd_args += ('--no-sandbox', '--disable-gpu')
+
+    if not check_ssl_validity:
+        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
+
+    if user_agent:
+        cmd_args += ('--user-agent={}'.format(user_agent),)
+
+    if resolution:
+        cmd_args += ('--window-size={}'.format(RESOLUTION),)
+
+    if timeout:
+        cmd_args += ('--timeout={}'.format((timeout) * 1000),)
+
+    # Find chrome user data directory
+    default_profile_paths = (
+        '~/.config/chromium',
+        '~/.config/google-chrome',
+        '~/.config/google-chrome-beta',
+        '~/.config/google-chrome-unstable',
+        '~/Library/Application Support/Chromium',
+        '~/Library/Application Support/Google/Chrome',
+        '~/Library/Application Support/Google/Chrome Canary',
+        '~/AppData/Local/Chromium/User Data',
+        '~/AppData/Local/Google/Chrome/User Data',
+        '~/AppData/Local/Google/Chrome SxS/User Data',
+    )
+    if user_data_dir:
+        cmd_args.append('--user-data-dir={}'.format(user_data_dir))
+    else:
+        for path in default_profile_paths:
+            full_path = os.path.expanduser(path)
+            if os.path.exists(full_path):
+                CACHED_USER_DATA_DIR = full_path
+                cmd_args.append('--user-data-dir={}'.format(full_path))
+                break
+
+    return cmd_args

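Note: a minimal sketch of how the new chrome_args() helper could be combined with the patched run() to take a screenshot; the URL, working directory, and flag combination are placeholders for illustration, not the committed fetch_screenshot code:

    from stdlib_patches import run, PIPE

    # Placeholder usage: build the shared chrome flags, then append the
    # subcommand-specific flags for a single archive method.
    cmd = [*chrome_args(timeout=60), '--screenshot', 'https://example.com']
    result = run(cmd, stdout=PIPE, stderr=PIPE, timeout=60, cwd='output/archive/1544212312.4234')
    print(result.returncode)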