split up utils into separate files

Nick Sweeting 2019-04-30 23:13:04 -04:00
parent daf5951897
commit 95007d9137
23 changed files with 820 additions and 759 deletions

View file

@ -1,14 +1,28 @@
__package__ = 'archivebox.cli'
import re
import os
import sys
import time
import argparse
from datetime import datetime
from dataclasses import dataclass
from typing import Optional, List
from multiprocessing import Process
from typing import Optional, List, Dict, Union, IO
from ..index.schema import Link, ArchiveResult
from ..config import ANSI, OUTPUT_DIR, IS_TTY
from ..index.json import to_json
from ..index.csv import links_to_csv
from ..util import enforce_types
from ..config import (
ConfigDict,
ANSI,
OUTPUT_DIR,
IS_TTY,
SHOW_PROGRESS,
TERM_WIDTH,
)
@dataclass
@ -32,11 +46,104 @@ class RuntimeStats:
_LAST_RUN_STATS = RuntimeStats()
def pretty_path(path: str) -> str:
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
pwd = os.path.abspath('.')
# parent = os.path.abspath(os.path.join(pwd, os.path.pardir))
return path.replace(pwd + '/', './')
class SmartFormatter(argparse.HelpFormatter):
"""Patched formatter that prints newlines in argparse help strings"""
def _split_lines(self, text, width):
if '\n' in text:
return text.splitlines()
return argparse.HelpFormatter._split_lines(self, text, width)
def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
"""Tell the user they passed stdin to a command that doesn't accept it"""
if stdin and not stdin.isatty():
stdin_raw_text = stdin.read().strip()
if stdin_raw_text:
print(
'{red}[X] The "{}" command does not accept stdin.{reset}\n'.format(
caller,
**ANSI,
)
)
print(' Run archivebox "{} --help" to see usage and examples.'.format(
caller,
))
print()
raise SystemExit(1)
def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
if stdin and not stdin.isatty():
return stdin.read()
return None
class TimedProgress:
"""Show a progress bar and measure elapsed time until .end() is called"""
def __init__(self, seconds, prefix=''):
if SHOW_PROGRESS:
self.p = Process(target=progress_bar, args=(seconds, prefix))
self.p.start()
self.stats = {'start_ts': datetime.now(), 'end_ts': None}
def end(self):
"""immediately end progress, clear the progressbar line, and save end_ts"""
end_ts = datetime.now()
self.stats['end_ts'] = end_ts
if SHOW_PROGRESS:
# protect from double termination
#if p is None or not hasattr(p, 'kill'):
# return
if self.p is not None:
self.p.terminate()
self.p = None
sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset'])) # clear whole terminal line
@enforce_types
def progress_bar(seconds: int, prefix: str='') -> None:
"""show timer in the form of progress bar, with percentage and seconds remaining"""
chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
chunks = TERM_WIDTH() - len(prefix) - 20 # number of progress chunks to show (aka max bar width)
try:
for s in range(seconds * chunks):
chunks = TERM_WIDTH() - len(prefix) - 20
progress = s / chunks / seconds * 100
bar_width = round(progress/(100/chunks))
# ████████████████████ 0.9% (1/60sec)
sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
prefix,
ANSI['green'],
(chunk * bar_width).ljust(chunks),
ANSI['reset'],
round(progress, 1),
round(s/chunks),
seconds,
))
sys.stdout.flush()
time.sleep(1 / chunks)
# ██████████████████████████████████ 100.0% (60/60sec)
sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
prefix,
ANSI['red'],
chunk * chunks,
ANSI['reset'],
100.0,
seconds,
seconds,
))
sys.stdout.flush()
except KeyboardInterrupt:
print()
pass
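For reference, the extractor modules later in this diff wrap each long-running step in TimedProgress roughly like the sketch below; work() and the 60-second timeout are illustrative placeholders and not part of this commit, and SHOW_PROGRESS/TERM_WIDTH still come from the loaded config:

    from archivebox.cli.logging import TimedProgress

    timer = TimedProgress(60, prefix='      ')
    try:
        work()          # placeholder for the long-running step being timed
    finally:
        timer.end()     # stop the bar, clear the terminal line, record end_ts
    print(timer.stats)  # {'start_ts': datetime(...), 'end_ts': datetime(...)}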
### Parsing Stage
@ -223,10 +330,9 @@ def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
print(' {}'.format(' '.join(filter_patterns or ())))
def log_list_finished(links):
from ..util import links_to_csv
print()
print('---------------------------------------------------------------------------------------------------')
print(links_to_csv(links, csv_cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
print(links_to_csv(links, cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
print('---------------------------------------------------------------------------------------------------')
print()
@ -266,3 +372,129 @@ def log_removal_finished(all_links: int, to_keep: int):
**ANSI,
))
print(' Index now contains {} links.'.format(to_keep))
def log_shell_welcome_msg():
from . import list_subcommands
print('{green}# ArchiveBox Imports{reset}'.format(**ANSI))
print('{green}from archivebox.core.models import Page, User{reset}'.format(**ANSI))
print('{green}from archivebox import *\n {}{reset}'.format("\n ".join(list_subcommands().keys()), **ANSI))
print()
print('[i] Welcome to the ArchiveBox Shell!')
print(' https://github.com/pirate/ArchiveBox/wiki/Usage#Shell-Usage')
print()
print(' {lightred}Hint:{reset} Example use:'.format(**ANSI))
print(' print(Page.objects.filter(is_archived=True).count())')
print(' Page.objects.get(url="https://example.com").as_json()')
print(' add("https://example.com/some/new/url")')
### Helpers
@enforce_types
def pretty_path(path: str) -> str:
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
pwd = os.path.abspath('.')
# parent = os.path.abspath(os.path.join(pwd, os.path.pardir))
return path.replace(pwd + '/', './')
@enforce_types
def printable_filesize(num_bytes: Union[int, float]) -> str:
for count in ['Bytes','KB','MB','GB']:
if num_bytes > -1024.0 and num_bytes < 1024.0:
return '%3.1f %s' % (num_bytes, count)
num_bytes /= 1024.0
return '%3.1f %s' % (num_bytes, 'TB')
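printable_filesize just keeps dividing by 1024 until the value drops below 1024, so the expected outputs look roughly like this quick illustrative check (not part of the commit):

    printable_filesize(512)          # '512.0 Bytes'
    printable_filesize(2048)         # '2.0 KB'
    printable_filesize(5 * 1024**2)  # '5.0 MB'
    printable_filesize(3 * 1024**4)  # '3.0 TB'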
@enforce_types
def printable_folders(folders: Dict[str, Optional[Link]],
json: bool=False,
csv: Optional[str]=None) -> str:
if json:
return to_json(folders.values(), indent=4, sort_keys=True)
elif csv:
return links_to_csv(folders.values(), cols=csv.split(','), header=True)
return '\n'.join(f'{folder} {link}' for folder, link in folders.items())
@enforce_types
def printable_config(config: ConfigDict, prefix: str='') -> str:
return f'\n{prefix}'.join(
f'{key}={val}'
for key, val in config.items()
if not (isinstance(val, dict) or callable(val))
)
@enforce_types
def printable_folder_status(name: str, folder: Dict) -> str:
if folder['enabled']:
if folder['is_valid']:
color, symbol, note = 'green', '√', 'valid'
else:
color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
else:
color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'
if folder['path']:
if os.path.exists(folder['path']):
num_files = (
f'{len(os.listdir(folder["path"]))} files'
if os.path.isdir(folder['path']) else
printable_filesize(os.path.getsize(folder['path']))
)
else:
num_files = 'missing'
if ' ' in folder['path']:
folder['path'] = f'"{folder["path"]}"'
return ' '.join((
ANSI[color],
symbol,
ANSI['reset'],
name.ljust(22),
(folder["path"] or '').ljust(76),
num_files.ljust(14),
ANSI[color],
note,
ANSI['reset'],
))
@enforce_types
def printable_dependency_version(name: str, dependency: Dict) -> str:
if dependency['enabled']:
if dependency['is_valid']:
color, symbol, note, version = 'green', '√', 'valid', ''
parsed_version_num = re.search(r'[\d\.]+', dependency['version'])
if parsed_version_num:
version = f'v{parsed_version_num[0]}'
if not version:
color, symbol, note, version = 'red', 'X', 'invalid', '?'
else:
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
if ' ' in dependency["path"]:
dependency["path"] = f'"{dependency["path"]}"'
return ' '.join((
ANSI[color],
symbol,
ANSI['reset'],
name.ljust(22),
(dependency["path"] or '').ljust(76),
version.ljust(14),
ANSI[color],
note,
ANSI['reset'],
))
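The version detection in printable_dependency_version above keeps only the first run of digits and dots from the binary's version banner; a standalone sketch of that step (the curl banner is just an example input):

    import re

    raw = 'curl 7.64.1 (x86_64-apple-darwin18.0) libcurl/7.64.1'
    match = re.search(r'[\d\.]+', raw)
    version = f'v{match[0]}' if match else '?'   # -> 'v7.64.1'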

View file

@ -119,6 +119,27 @@ DEFAULT_CLI_COLORS = {
}
ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()}
STATICFILE_EXTENSIONS = {
# 99.999% of the time, URLs ending in these extensions are static files
# that can be downloaded as-is, not html pages that need to be rendered
'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
'atom', 'rss', 'css', 'js', 'json',
'dmg', 'iso', 'img',
'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
# Less common extensions to consider adding later
# jar, swf, bin, com, exe, dll, deb
# ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
# pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
# ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
# These are always treated as pages, not as static files, never add them:
# html, htm, shtml, xhtml, xml, aspx, php, cgi
}
VERSION_FILENAME = 'VERSION'
PYTHON_DIR_NAME = 'archivebox'
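STATICFILE_EXTENSIONS now lives in config.py and is consumed by the is_static_file helper that stays in util.py (shown further down in this diff). A rough sketch of how that check behaves, with extension() here as a simplified stand-in for the real URL helper:

    STATICFILE_EXTENSIONS = {'pdf', 'png', 'mp4'}   # abridged copy of the set above

    def extension(url: str) -> str:
        # simplified stand-in: everything after the last '.' in the last path segment
        last_segment = url.rsplit('/', 1)[-1]
        return last_segment.rsplit('.', 1)[-1] if '.' in last_segment else ''

    def is_static_file(url: str) -> bool:
        return extension(url).lower() in STATICFILE_EXTENSIONS

    is_static_file('https://example.com/report.PDF')    # True: downloaded as-is
    is_static_file('https://example.com/post/index')    # False: rendered as a page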

View file

@ -64,3 +64,7 @@ class Page(models.Model):
@property
def base_url(self):
return self.as_link().base_url
@property
def link_dir(self):
return self.as_link().link_dir

View file

@ -4,17 +4,19 @@ import os
import sys
SECRET_KEY = '---------------- not a valid secret key ! ----------------'
DEBUG = True
DEBUG = os.getenv('DEBUG', 'False').lower() == 'true'
ALLOWED_HOSTS = ['*']
REPO_DIR = os.path.abspath(os.path.join(os.path.abspath(__file__), os.path.pardir, os.path.pardir))
OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR', os.curdir))
ARCHIVE_DIR = os.path.join(OUTPUT_DIR, 'archive')
DATABASE_FILE = os.path.join(OUTPUT_DIR, 'index.sqlite3')
ACTIVE_THEME = 'default'
IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
APPEND_SLASH = True
INSTALLED_APPS = [
'django.contrib.auth',

View file

@ -1,17 +1,6 @@
from cli import list_subcommands
from .config import ANSI
from cli.logging import log_shell_welcome_msg
if __name__ == '__main__':
print('{green}# ArchiveBox Imports{reset}'.format(**ANSI))
# print('from archivebox.core.models import Page, User')
print('{green}from archivebox.cli import\narchivebox_{}{reset}'.format("\narchivebox_".join(list_subcommands().keys()), **ANSI))
print()
print('[i] Welcome to the ArchiveBox Shell! Example use:')
print(' print(Page.objects.filter(is_archived=True).count())')
print(' Page.objects.get(url="https://example.com").as_json()')
print(' Page.objects.get(url="https://example.com").as_json()')
print(' from archivebox.main import get_invalid_folders')
from main import *
log_shell_welcome_msg()

View file

@ -5,16 +5,11 @@ import os
from typing import Optional, List, Dict, Tuple
from collections import defaultdict
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, DEVNULL, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
DEVNULL,
is_static_file,
ArchiveError,
chmod_file,
)
from ..config import (
VERSION,
@ -24,6 +19,7 @@ from ..config import (
CURL_VERSION,
CHECK_SSL_VALIDITY
)
from ..cli.logging import TimedProgress

View file

@ -4,22 +4,19 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chrome_args,
chmod_file,
)
from ..config import (
TIMEOUT,
SAVE_DOM,
CHROME_VERSION,
)
from ..cli.logging import TimedProgress

View file

@ -5,14 +5,8 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..util import (
enforce_types,
TimedProgress,
domain,
run,
PIPE,
chmod_file,
)
from ..system import chmod_file, run, PIPE
from ..util import enforce_types, domain
from ..config import (
TIMEOUT,
SAVE_FAVICON,
@ -20,6 +14,7 @@ from ..config import (
CURL_VERSION,
CHECK_SSL_VALIDITY,
)
from ..cli.logging import TimedProgress
@enforce_types

View file

@ -4,15 +4,11 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chmod_file,
domain,
extension,
without_query,
@ -26,6 +22,7 @@ from ..config import (
GIT_DOMAINS,
CHECK_SSL_VALIDITY
)
from ..cli.logging import TimedProgress

View file

@ -4,15 +4,11 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chmod_file,
)
from ..config import (
MEDIA_TIMEOUT,
@ -21,6 +17,7 @@ from ..config import (
YOUTUBEDL_VERSION,
CHECK_SSL_VALIDITY
)
from ..cli.logging import TimedProgress
@enforce_types

View file

@ -4,23 +4,19 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chrome_args,
chmod_file,
)
from ..config import (
TIMEOUT,
SAVE_PDF,
CHROME_VERSION,
)
from ..cli.logging import TimedProgress
@enforce_types

View file

@ -4,22 +4,19 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chrome_args,
chmod_file,
)
from ..config import (
TIMEOUT,
SAVE_SCREENSHOT,
CHROME_VERSION,
)
from ..cli.logging import TimedProgress

View file

@ -1,14 +1,14 @@
__package__ = 'archivebox.extractors'
import re
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..util import (
enforce_types,
TimedProgress,
is_static_file,
ArchiveError,
fetch_page_title,
download_url,
htmldecode,
)
from ..config import (
TIMEOUT,
@ -16,6 +16,14 @@ from ..config import (
CURL_BINARY,
CURL_VERSION,
)
from ..cli.logging import TimedProgress
HTML_TITLE_REGEX = re.compile(
r'<title.*?>' # start matching text after <title> tag
r'(.[^<>]+)', # get everything up to these symbols
re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)
@enforce_types
@ -44,7 +52,9 @@ def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
output = fetch_page_title(link.url, timeout=timeout, progress=False)
html = download_url(link.url, timeout=timeout)
match = re.search(HTML_TITLE_REGEX, html)
output = htmldecode(match.group(1).strip()) if match else None
if not output:
raise ArchiveError('Unable to detect page title')
except Exception as err:
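save_title now inlines the old fetch_page_title logic using download_url plus HTML_TITLE_REGEX. A quick sanity check of that regex against a made-up snippet (html.unescape stands in for the htmldecode helper, which wraps it):

    import re
    from html import unescape

    HTML_TITLE_REGEX = re.compile(
        r'<title.*?>'   # start matching text after the <title ...> tag
        r'(.[^<>]+)',   # capture everything up to the next < or >
        re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
    )

    html = '<head><title lang="en">Example Domain &amp; More</title></head>'
    match = re.search(HTML_TITLE_REGEX, html)
    title = unescape(match.group(1).strip()) if match else None   # 'Example Domain & More'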

View file

@ -1,18 +1,22 @@
__package__ = 'archivebox.extractors'
import os
import re
from typing import Optional
from datetime import datetime
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
wget_output_path,
ArchiveError,
is_static_file,
without_scheme,
without_fragment,
without_query,
path,
domain,
urldecode,
)
from ..config import (
TIMEOUT,
@ -26,7 +30,7 @@ from ..config import (
WGET_USER_AGENT,
COOKIES_FILE,
)
from ..cli.logging import TimedProgress
@enforce_types
@ -121,3 +125,72 @@ def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
status=status,
**timer.stats,
)
@enforce_types
def wget_output_path(link: Link) -> Optional[str]:
"""calculate the path to the wgetted .html file, since wget may
adjust some paths to be different than the base_url path.
See docs on wget --adjust-extension (-E)
"""
if is_static_file(link.url):
return without_scheme(without_fragment(link.url))
# Wget downloads can save in a number of different ways depending on the url:
# https://example.com
# > example.com/index.html
# https://example.com?v=zzVa_tX1OiI
# > example.com/index.html?v=zzVa_tX1OiI.html
# https://www.example.com/?v=zzVa_tX1OiI
# > example.com/index.html?v=zzVa_tX1OiI.html
# https://example.com/abc
# > example.com/abc.html
# https://example.com/abc/
# > example.com/abc/index.html
# https://example.com/abc?v=zzVa_tX1OiI.html
# > example.com/abc?v=zzVa_tX1OiI.html
# https://example.com/abc/?v=zzVa_tX1OiI.html
# > example.com/abc/index.html?v=zzVa_tX1OiI.html
# https://example.com/abc/test.html
# > example.com/abc/test.html
# https://example.com/abc/test?v=zzVa_tX1OiI
# > example.com/abc/test?v=zzVa_tX1OiI.html
# https://example.com/abc/test/?v=zzVa_tX1OiI
# > example.com/abc/test/index.html?v=zzVa_tX1OiI.html
# There's also a lot of complexity around how the urlencoding and renaming
# is done for pages with query and hash fragments or extensions like shtml / htm / php / etc
# Since the wget algorithm for -E (appending .html) is incredibly complex
# and there's no way to get the computed output path from wget itself,
# rather than trying to reverse-engineer how it calculates the filename,
# we just look in the output folder and read the filename wget actually used from the filesystem
full_path = without_fragment(without_query(path(link.url))).strip('/')
search_dir = os.path.join(
link.link_dir,
domain(link.url),
urldecode(full_path),
)
for _ in range(4):
if os.path.exists(search_dir):
if os.path.isdir(search_dir):
html_files = [
f for f in os.listdir(search_dir)
if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
]
if html_files:
path_from_link_dir = search_dir.split(link.link_dir)[-1].strip('/')
return os.path.join(path_from_link_dir, html_files[0])
# Move up one directory level
search_dir = search_dir.rsplit('/', 1)[0]
if search_dir == link.link_dir:
break
return None
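The search at the end of wget_output_path boils down to: start at <link_dir>/<domain>/<url path> and walk up at most four directory levels looking for any .html/.shtml/.htm file, returning its path relative to link_dir. A simplified standalone sketch of just that loop (find_wgetted_html is an illustrative name, not part of this commit):

    import os
    import re
    from typing import Optional

    def find_wgetted_html(link_dir: str, search_dir: str) -> Optional[str]:
        for _ in range(4):
            if os.path.isdir(search_dir):
                html_files = [
                    f for f in os.listdir(search_dir)
                    if re.search(r'\.[Ss]?[Hh][Tt][Mm][Ll]?$', f)
                ]
                if html_files:
                    return os.path.relpath(os.path.join(search_dir, html_files[0]), link_dir)
            search_dir = search_dir.rsplit('/', 1)[0]   # move up one directory level
            if search_dir == link_dir:
                break
        return None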

View file

@ -10,12 +10,10 @@ from typing import List, Tuple, Dict, Optional, Iterable
from collections import OrderedDict
from contextlib import contextmanager
from ..parsers import parse_links
from ..system import atomic_write
from ..util import (
scheme,
enforce_types,
TimedProgress,
atomic_write,
ExtendedEncoder,
)
from ..config import (
@ -30,6 +28,7 @@ from ..config import (
stderr,
)
from ..cli.logging import (
TimedProgress,
log_indexing_process_started,
log_indexing_process_finished,
log_indexing_started,
@ -278,6 +277,8 @@ def import_new_links(existing_links: List[Link],
import_path: str,
out_dir: str=OUTPUT_DIR) -> Tuple[List[Link], List[Link]]:
from ..parsers import parse_links
new_links: List[Link] = []
# parse and validate the import file
@ -584,9 +585,9 @@ def fix_invalid_folder_locations(out_dir: str=OUTPUT_DIR) -> Tuple[List[str], Li
else:
shutil.move(entry.path, dest)
fixed.append(dest)
if link.link_dir != entry.path:
link = link.overwrite(link_dir=entry.path)
write_json_link_details(link, out_dir=entry.path)
timestamp = entry.path.rsplit('/', 1)[-1]
assert link.link_dir == entry.path
assert link.timestamp == timestamp
write_json_link_details(link, out_dir=entry.path)
return fixed, cant_fix

archivebox/index/csv.py (new file, 37 lines added)
View file

@ -0,0 +1,37 @@
__package__ = 'archivebox.index'
from typing import List, Optional, Any
from ..util import enforce_types
from .schema import Link
@enforce_types
def links_to_csv(links: List[Link],
cols: Optional[List[str]]=None,
header: bool=True,
separator: str=',',
ljust: int=0) -> str:
cols = cols or ['timestamp', 'is_archived', 'url']
header_str = ''
if header:
header_str = separator.join(col.ljust(ljust) for col in cols)
row_strs = (
link.to_csv(cols=cols, ljust=ljust, separator=separator)
for link in links
)
return '\n'.join((header_str, *row_strs))
@enforce_types
def to_csv(obj: Any, cols: List[str], separator: str=',', ljust: int=0) -> str:
from .json import to_json
return separator.join(
to_json(getattr(obj, col), indent=None).ljust(ljust)
for col in cols
)
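The list output printed by log_list_finished in cli/logging.py comes from this module; usage looks roughly like the sketch below, assuming links is a list of Link objects already loaded from the index:

    from archivebox.index.csv import links_to_csv

    print(links_to_csv(
        links,                # assumed: List[Link] loaded from the main index
        cols=['timestamp', 'is_archived', 'url'],
        header=True,
        ljust=16,
        separator=' | ',
    ))
    # roughly:
    # timestamp        | is_archived      | url
    # "1556678400.0"   | true             | "https://example.com"

Each cell is JSON-encoded via to_json(..., indent=None), which is why strings come out quoted and booleans lowercase.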

View file

@ -2,20 +2,18 @@ __package__ = 'archivebox.index'
import os
from string import Template
from datetime import datetime
from typing import List, Optional, Iterator
from typing import List, Optional, Iterator, Mapping
from .schema import Link
from ..system import atomic_write, copy_and_overwrite
from ..util import (
enforce_types,
ts_to_date,
urlencode,
htmlencode,
urldecode,
wget_output_path,
render_template,
atomic_write,
copy_and_overwrite,
)
from ..config import (
OUTPUT_DIR,
@ -67,7 +65,7 @@ def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished:
def main_index_template(links: List[Link], finished: bool=True) -> str:
"""render the template for the entire main index"""
return render_template(MAIN_INDEX_TEMPLATE, {
return render_legacy_template(MAIN_INDEX_TEMPLATE, {
'version': VERSION,
'git_sha': GIT_SHA,
'num_links': str(len(links)),
@ -86,7 +84,9 @@ def main_index_template(links: List[Link], finished: bool=True) -> str:
def main_index_row_template(link: Link) -> str:
"""render the template for an individual link row of the main index"""
return render_template(MAIN_INDEX_ROW_TEMPLATE, {
from ..extractors.wget import wget_output_path
return render_legacy_template(MAIN_INDEX_ROW_TEMPLATE, {
**link._asdict(extended=True),
# before pages are finished archiving, show loading msg instead of title
@ -122,9 +122,11 @@ def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
@enforce_types
def link_details_template(link: Link) -> str:
from ..extractors.wget import wget_output_path
link_info = link._asdict(extended=True)
return render_template(LINK_DETAILS_TEMPLATE, {
return render_legacy_template(LINK_DETAILS_TEMPLATE, {
**link_info,
**link_info['canonical'],
'title': (
@ -142,3 +144,13 @@ def link_details_template(link: Link) -> str:
'status_color': 'success' if link.is_archived else 'danger',
'oldest_archive_date': ts_to_date(link.oldest_archive_date),
})
@enforce_types
def render_legacy_template(template_path: str, context: Mapping[str, str]) -> str:
"""render a given html template string with the given template content"""
# will be replaced by django templates in the future
with open(template_path, 'r', encoding='utf-8') as template:
template_str = template.read()
return Template(template_str).substitute(**context)
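render_legacy_template is plain string.Template substitution over the legacy html template files (hence the note about Django templates); the stdlib behaviour it relies on, shown with a made-up template string:

    from string import Template

    template_str = '<tr><td>$title</td><td><a href="$url">$url</a></td></tr>'
    row = Template(template_str).substitute(title='Example', url='https://example.com')
    # '<tr><td>Example</td><td><a href="https://example.com">https://example.com</a></td></tr>'

Note that substitute() raises KeyError if any $placeholder in the template is missing from the context.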

View file

@ -2,13 +2,14 @@ __package__ = 'archivebox.index'
import os
import sys
import json
import json as pyjson
from datetime import datetime
from typing import List, Optional, Iterator
from typing import List, Optional, Iterator, Any
from .schema import Link, ArchiveResult
from ..util import enforce_types, atomic_write
from ..system import atomic_write
from ..util import enforce_types
from ..config import (
VERSION,
OUTPUT_DIR,
@ -46,7 +47,7 @@ def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
if os.path.exists(index_path):
with open(index_path, 'r', encoding='utf-8') as f:
links = json.load(f)['links']
links = pyjson.load(f)['links']
for link_json in links:
yield Link.from_json(link_json)
@ -95,12 +96,13 @@ def parse_json_link_details(out_dir: str) -> Optional[Link]:
if os.path.exists(existing_index):
with open(existing_index, 'r', encoding='utf-8') as f:
try:
link_json = json.load(f)
link_json = pyjson.load(f)
return Link.from_json(link_json)
except json.JSONDecodeError:
except pyjson.JSONDecodeError:
pass
return None
@enforce_types
def parse_json_links_details(out_dir: str) -> Iterator[Link]:
"""read through all the archive data folders and return the parsed links"""
@ -111,3 +113,41 @@ def parse_json_links_details(out_dir: str) -> Iterator[Link]:
link = parse_json_link_details(entry.path)
if link:
yield link
### Helpers
class ExtendedEncoder(pyjson.JSONEncoder):
"""
Extended json serializer that supports serializing several model
fields and objects
"""
def default(self, obj):
cls_name = obj.__class__.__name__
if hasattr(obj, '_asdict'):
return obj._asdict()
elif isinstance(obj, bytes):
return obj.decode()
elif isinstance(obj, datetime):
return obj.isoformat()
elif isinstance(obj, Exception):
return '{}: {}'.format(obj.__class__.__name__, obj)
elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
return tuple(obj)
return pyjson.JSONEncoder.default(self, obj)
@enforce_types
def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str:
return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
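ExtendedEncoder mostly exists so datetimes, bytes, exceptions, and objects with _asdict survive serialization; a quick illustration using the module-level to_json (the values are arbitrary):

    from datetime import datetime
    from archivebox.index.json import to_json

    print(to_json({
        'fetched_at': datetime(2019, 4, 30, 23, 13),
        'raw': b'abc',
        'error': ValueError('bad url'),
    }, indent=None))
    # {"error": "ValueError: bad url", "fetched_at": "2019-04-30T23:13:00", "raw": "abc"}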

View file

@ -61,19 +61,20 @@ class ArchiveResult:
info['end_ts'] = parse_date(info['end_ts'])
return cls(**info)
def to_json(self, indent=4, sort_keys=True):
from ..util import to_json
def to_dict(self, *keys) -> dict:
if keys:
return {k: v for k, v in asdict(self).items() if k in keys}
return asdict(self)
def to_json(self, indent=4, sort_keys=True) -> str:
from .json import to_json
return to_json(self, indent=indent, sort_keys=sort_keys)
def to_csv(self, cols=None, ljust: int=0, separator: str=','):
from ..util import to_json
def to_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str:
from .csv import to_csv
cols = cols or self.field_names()
return separator.join(
to_json(getattr(self, col), indent=None).ljust(ljust)
for col in cols
)
return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust)
@classmethod
def field_names(cls):
@ -201,18 +202,15 @@ class Link:
info['history'] = cast_history
return cls(**info)
def to_json(self, indent=4, sort_keys=True):
from ..util import to_json
def to_json(self, indent=4, sort_keys=True) -> str:
from .json import to_json
return to_json(self, indent=indent, sort_keys=sort_keys)
def to_csv(self, csv_cols: List[str], ljust: int=0, separator: str=','):
from ..util import to_json
def to_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str:
from .csv import to_csv
return separator.join(
to_json(getattr(self, col), indent=None).ljust(ljust)
for col in csv_cols
)
return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust)
@classmethod
def field_names(cls):
@ -354,7 +352,7 @@ class Link:
def canonical_outputs(self) -> Dict[str, Optional[str]]:
"""predict the expected output paths that should be present after archiving"""
from ..util import wget_output_path
from ..extractors.wget import wget_output_path
canonical = {
'index_path': 'index.html',
'favicon_path': 'favicon.ico',
@ -382,3 +380,5 @@ class Link:
'dom_path': static_path,
})
return canonical

View file

@ -1,11 +1,10 @@
__package__ = 'archivebox'
import re
import os
import sys
import shutil
from typing import Dict, List, Optional, Set, Tuple, Iterable, IO
from typing import Dict, List, Optional, Iterable, IO
from crontab import CronTab, CronSlices
@ -17,18 +16,13 @@ from .cli import (
main_cmds,
archive_cmds,
)
from .index.schema import Link
from .util import (
enforce_types,
TimedProgress,
get_dir_size,
human_readable_size,
from .parsers import (
save_stdin_to_sources,
save_file_to_sources,
links_to_csv,
to_json,
folders_to_str,
)
from .index.schema import Link
from .util import enforce_types, docstring
from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
from .index import (
links_after_timestamp,
load_main_index,
@ -51,7 +45,11 @@ from .index.json import (
parse_json_main_index,
parse_json_links_details,
)
from .index.sql import parse_sql_main_index, get_admins, apply_migrations
from .index.sql import (
parse_sql_main_index,
get_admins,
apply_migrations,
)
from .index.html import parse_html_main_index
from .extractors import archive_link
from .config import (
@ -91,6 +89,7 @@ from .config import (
get_real_name,
)
from .cli.logging import (
TimedProgress,
log_archiving_started,
log_archiving_paused,
log_archiving_finished,
@ -98,6 +97,11 @@ from .cli.logging import (
log_removal_finished,
log_list_started,
log_list_finished,
printable_config,
printable_folders,
printable_filesize,
printable_folder_status,
printable_dependency_version,
)
@ -387,7 +391,7 @@ def info(out_dir: str=OUTPUT_DIR) -> None:
print('{green}[*] Scanning archive collection main index...{reset}'.format(**ANSI))
print(f' {out_dir}/*')
num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
size = human_readable_size(num_bytes)
size = printable_filesize(num_bytes)
print(f' Size: {size} across {num_files} files')
print()
@ -419,7 +423,7 @@ def info(out_dir: str=OUTPUT_DIR) -> None:
print(f' {ARCHIVE_DIR}/*')
num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
size = human_readable_size(num_bytes)
size = printable_filesize(num_bytes)
print(f' Size: {size} across {num_files} files in {num_dirs} directories')
print()
@ -712,13 +716,8 @@ def list_all(filter_patterns_str: Optional[str]=None,
out_dir=out_dir,
)
if csv:
print(links_to_csv(folders.values(), csv_cols=csv.split(','), header=True))
elif json:
print(to_json(folders.values(), indent=4, sort_keys=True))
else:
print(folders_to_str(folders))
raise SystemExit(not folders)
print(printable_folders(folders, json=json, csv=csv))
return folders
@enforce_types
@ -749,7 +748,7 @@ def list_folders(links: List[Link],
status: str,
out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
check_data_folder()
check_data_folder(out_dir=out_dir)
if status == 'indexed':
return get_indexed_folders(links, out_dir=out_dir)
@ -796,7 +795,7 @@ def config(config_options_str: Optional[str]=None,
)
raise SystemExit(2)
elif config_options_str:
config_options = stdin_raw_text.split('\n')
config_options = config_options_str.split('\n')
config_options = config_options or []
@ -865,7 +864,6 @@ def config(config_options_str: Optional[str]=None,
stderr(' Please manually remove the relevant lines from your config file:')
stderr(f' {CONFIG_FILE}')
raise SystemExit(2)
else:
stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
stderr(' archivebox config')
@ -874,8 +872,6 @@ def config(config_options_str: Optional[str]=None,
raise SystemExit(2)
CRON_COMMENT = 'archivebox_schedule'
@enforce_types
def schedule(add: bool=False,
show: bool=False,
@ -893,7 +889,7 @@ def schedule(add: bool=False,
os.makedirs(os.path.join(out_dir, LOGS_DIR_NAME), exist_ok=True)
cron = CronTab(user=True)
cron = dedupe_jobs(cron)
cron = dedupe_cron_jobs(cron)
existing_jobs = list(cron.find_comment(CRON_COMMENT))
if foreground or run_all:
@ -962,7 +958,7 @@ def schedule(add: bool=False,
stderr(' archivebox init --every="0/5 * * * *" https://example.com/some/rss/feed.xml')
raise SystemExit(1)
cron = dedupe_jobs(cron)
cron = dedupe_cron_jobs(cron)
cron.write()
total_runs = sum(j.frequency_per_year() for j in cron)
@ -1025,95 +1021,13 @@ def manage(args: Optional[List[str]]=None, out_dir: str=OUTPUT_DIR) -> None:
execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])
@enforce_types
def shell(out_dir: str=OUTPUT_DIR) -> None:
"""Enter an interactive ArchiveBox Django shell"""
check_data_folder(out_dir=out_dir)
setup_django(OUTPUT_DIR)
from django.core.management import call_command
call_command("shell_plus")
# Helpers
def printable_config(config: ConfigDict, prefix: str='') -> str:
return f'\n{prefix}'.join(
f'{key}={val}'
for key, val in config.items()
if not (isinstance(val, dict) or callable(val))
)
def dedupe_jobs(cron: CronTab) -> CronTab:
deduped: Set[Tuple[str, str]] = set()
for job in list(cron):
unique_tuple = (str(job.slices), job.command)
if unique_tuple not in deduped:
deduped.add(unique_tuple)
cron.remove(job)
for schedule, command in deduped:
job = cron.new(command=command, comment=CRON_COMMENT)
job.setall(schedule)
job.enable()
return cron
def print_folder_status(name, folder):
if folder['enabled']:
if folder['is_valid']:
color, symbol, note = 'green', '', 'valid'
else:
color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
else:
color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'
if folder['path']:
if os.path.exists(folder['path']):
num_files = (
f'{len(os.listdir(folder["path"]))} files'
if os.path.isdir(folder['path']) else
human_readable_size(os.path.getsize(folder['path']))
)
else:
num_files = 'missing'
if ' ' in folder['path']:
folder['path'] = f'"{folder["path"]}"'
print(
ANSI[color],
symbol,
ANSI['reset'],
name.ljust(22),
(folder["path"] or '').ljust(76),
num_files.ljust(14),
ANSI[color],
note,
ANSI['reset'],
)
def print_dependency_version(name, dependency):
if dependency['enabled']:
if dependency['is_valid']:
color, symbol, note = 'green', '', 'valid'
version = 'v' + re.search(r'[\d\.]+', dependency['version'])[0]
else:
color, symbol, note, version = 'red', 'X', 'invalid', '?'
else:
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
if ' ' in dependency["path"]:
dependency["path"] = f'"{dependency["path"]}"'
print(
ANSI[color],
symbol,
ANSI['reset'],
name.ljust(22),
(dependency["path"] or '').ljust(76),
version.ljust(14),
ANSI[color],
note,
ANSI['reset'],
)

View file

@ -7,16 +7,29 @@ For examples of supported import formats see tests/.
__package__ = 'archivebox.parsers'
import re
import os
from typing import Tuple, List
from datetime import datetime
from ..config import TIMEOUT
from ..util import (
check_url_parsing_invariants,
TimedProgress,
Link,
enforce_types,
from ..index.schema import Link
from ..system import atomic_write
from ..config import (
ANSI,
OUTPUT_DIR,
SOURCES_DIR_NAME,
TIMEOUT,
check_data_folder,
)
from ..util import (
basename,
domain,
download_url,
enforce_types,
URL_REGEX,
)
from ..cli.logging import pretty_path, TimedProgress
from .pocket_html import parse_pocket_html_export
from .pinboard_rss import parse_pinboard_rss_export
from .shaarli_rss import parse_shaarli_rss_export
@ -66,3 +79,95 @@ def parse_links(source_file: str) -> Tuple[List[Link], str]:
timer.end()
return [], 'Failed to parse'
@enforce_types
def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str:
check_data_folder(out_dir=out_dir)
sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
if not os.path.exists(sources_dir):
os.makedirs(sources_dir)
ts = str(datetime.now().timestamp()).split('.', 1)[0]
source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts))
atomic_write(raw_text, source_path)
return source_path
@enforce_types
def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DIR) -> str:
"""download a given url's content into output/sources/domain-<timestamp>.txt"""
check_data_folder(out_dir=out_dir)
sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
if not os.path.exists(sources_dir):
os.makedirs(sources_dir)
ts = str(datetime.now().timestamp()).split('.', 1)[0]
source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts))
if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts))
print('{}[*] [{}] Downloading {}{}'.format(
ANSI['green'],
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
path,
ANSI['reset'],
))
timer = TimedProgress(timeout, prefix=' ')
try:
raw_source_text = download_url(path, timeout=timeout)
timer.end()
except Exception as e:
timer.end()
print('{}[!] Failed to download {}{}\n'.format(
ANSI['red'],
path,
ANSI['reset'],
))
print(' ', e)
raise SystemExit(1)
else:
with open(path, 'r') as f:
raw_source_text = f.read()
atomic_write(raw_source_text, source_path)
print(' > {}'.format(pretty_path(source_path)))
return source_path
def check_url_parsing_invariants() -> None:
"""Check that plain text regex URL parsing works as expected"""
# this is last-line-of-defense to make sure the URL_REGEX isn't
# misbehaving, as the consequences could be disastrous and lead to many
# incorrect/badly parsed links being added to the archive
test_urls = '''
https://example1.com/what/is/happening.html?what=1#how-about-this=1
https://example2.com/what/is/happening/?what=1#how-about-this=1
HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
https://example4.com/what/is/happening.html
https://example5.com/
https://example6.com
<test>http://example7.com</test>
[https://example8.com/what/is/this.php?what=1]
[and http://example9.com?what=1&other=3#and-thing=2]
<what>https://example10.com#and-thing=2 "</about>
abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
example13.bada
and example14.badb
<or>htt://example15.badc</that>
'''
# print('\n'.join(re.findall(URL_REGEX, test_urls)))
assert len(re.findall(URL_REGEX, test_urls)) == 12

archivebox/system.py (new file, 150 lines added)
View file

@ -0,0 +1,150 @@
__package__ = 'archivebox'
import os
import shutil
import json as pyjson
from typing import Optional, Union, Set, Tuple
from crontab import CronTab
from subprocess import (
Popen,
PIPE,
DEVNULL,
CompletedProcess,
TimeoutExpired,
CalledProcessError,
)
from .util import enforce_types, ExtendedEncoder
from .config import OUTPUT_PERMISSIONS
def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
"""Patched of subprocess.run to fix blocking io making timeout=innefective"""
if input is not None:
if 'stdin' in kwargs:
raise ValueError('stdin and input arguments may not both be used.')
kwargs['stdin'] = PIPE
if capture_output:
if ('stdout' in kwargs) or ('stderr' in kwargs):
raise ValueError('stdout and stderr arguments may not be used '
'with capture_output.')
kwargs['stdout'] = PIPE
kwargs['stderr'] = PIPE
with Popen(*popenargs, **kwargs) as process:
try:
stdout, stderr = process.communicate(input, timeout=timeout)
except TimeoutExpired:
process.kill()
try:
stdout, stderr = process.communicate(input, timeout=2)
except:
pass
raise TimeoutExpired(popenargs[0][0], timeout)
except BaseException:
process.kill()
# We don't call process.wait() as .__exit__ does that for us.
raise
retcode = process.poll()
if check and retcode:
raise CalledProcessError(retcode, process.args,
output=stdout, stderr=stderr)
return CompletedProcess(process.args, retcode, stdout, stderr)
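The patched run() keeps subprocess.run's interface, so callers like chmod_file below can pass capture/timeout arguments unchanged; the difference is that a hung child gets killed instead of blocking forever in communicate(). An illustrative call (the sleep command is arbitrary):

    from subprocess import TimeoutExpired
    from archivebox.system import run

    try:
        result = run(['sleep', '10'], capture_output=True, timeout=1)
    except TimeoutExpired:
        print('child process exceeded the timeout and was killed')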
def atomic_write(contents: Union[dict, str, bytes], path: str) -> None:
"""Safe atomic write to filesystem by writing to temp file + atomic rename"""
try:
tmp_file = '{}.tmp'.format(path)
if isinstance(contents, bytes):
args = {'mode': 'wb+'}
else:
args = {'mode': 'w+', 'encoding': 'utf-8'}
with open(tmp_file, **args) as f:
if isinstance(contents, dict):
pyjson.dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
else:
f.write(contents)
os.fsync(f.fileno())
os.rename(tmp_file, path)
chmod_file(path)
finally:
if os.path.exists(tmp_file):
os.remove(tmp_file)
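atomic_write replaces bare open()/write() calls throughout the codebase: contents go to a .tmp file first and are moved into place with os.rename, so readers never observe a half-written index. A usage sketch, assuming an initialized config (the paths are examples; dict contents are serialized with ExtendedEncoder, str/bytes are written verbatim):

    from archivebox.system import atomic_write

    atomic_write({'version': '0.4.0', 'links': []}, '/tmp/index.json')   # written as JSON
    atomic_write('plain text contents\n', '/tmp/notes.txt')              # written verbatim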
@enforce_types
def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, timeout: int=30) -> None:
"""chmod -R <permissions> <cwd>/<path>"""
if not os.path.exists(os.path.join(cwd, path)):
raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
if chmod_result.returncode == 1:
print(' ', chmod_result.stderr.decode())
raise Exception('Failed to chmod {}/{}'.format(cwd, path))
@enforce_types
def copy_and_overwrite(from_path: str, to_path: str):
"""copy a given file or directory to a given path, overwriting the destination"""
if os.path.isdir(from_path):
shutil.rmtree(to_path, ignore_errors=True)
shutil.copytree(from_path, to_path)
else:
with open(from_path, 'rb') as src:
atomic_write(src.read(), to_path)
@enforce_types
def get_dir_size(path: str, recursive: bool=True, pattern: Optional[str]=None) -> Tuple[int, int, int]:
"""get the total disk size of a given directory, optionally summing up
recursively and limiting to a given filter list
"""
num_bytes, num_dirs, num_files = 0, 0, 0
for entry in os.scandir(path):
if (pattern is not None) and (pattern not in entry.path):
continue
if entry.is_dir(follow_symlinks=False):
if not recursive:
continue
num_dirs += 1
bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
num_bytes += bytes_inside
num_dirs += dirs_inside
num_files += files_inside
else:
num_bytes += entry.stat(follow_symlinks=False).st_size
num_files += 1
return num_bytes, num_dirs, num_files
CRON_COMMENT = 'archivebox_schedule'
@enforce_types
def dedupe_cron_jobs(cron: CronTab) -> CronTab:
deduped: Set[Tuple[str, str]] = set()
for job in list(cron):
unique_tuple = (str(job.slices), job.command)
if unique_tuple not in deduped:
deduped.add(unique_tuple)
cron.remove(job)
for schedule, command in deduped:
job = cron.new(command=command, comment=CRON_COMMENT)
job.setall(schedule)
job.enable()
return cron
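dedupe_cron_jobs keys jobs on the (schedule, command) pair, so re-running archivebox schedule with the same arguments does not stack duplicate crontab entries. A small sketch using python-crontab, mirroring how schedule() in main.py calls it (the command string is illustrative):

    from crontab import CronTab
    from archivebox.system import CRON_COMMENT, dedupe_cron_jobs

    cron = CronTab(user=True)
    job = cron.new(command='archivebox add https://example.com/feed.xml', comment=CRON_COMMENT)
    job.setall('0 * * * *')          # hourly
    cron = dedupe_cron_jobs(cron)    # collapses identical (schedule, command) pairs
    cron.write()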

View file

@ -1,15 +1,8 @@
import os
import re
import sys
import ssl
import json
import time
import shutil
import argparse
from string import Template
from json import JSONEncoder
from typing import List, Dict, Optional, Any, Union, IO, Mapping, Tuple
from typing import List, Optional, Any
from inspect import signature
from functools import wraps
from hashlib import sha256
@ -17,34 +10,17 @@ from urllib.request import Request, urlopen
from urllib.parse import urlparse, quote, unquote
from html import escape, unescape
from datetime import datetime
from multiprocessing import Process
from subprocess import (
Popen,
PIPE,
DEVNULL,
CompletedProcess,
TimeoutExpired,
CalledProcessError,
)
from base32_crockford import encode as base32_encode # type: ignore
import json as pyjson
from .index.schema import Link
from .config import (
ANSI,
TERM_WIDTH,
OUTPUT_DIR,
SOURCES_DIR_NAME,
OUTPUT_PERMISSIONS,
TIMEOUT,
SHOW_PROGRESS,
SAVE_TITLE,
STATICFILE_EXTENSIONS,
CHECK_SSL_VALIDITY,
WGET_USER_AGENT,
CHROME_OPTIONS,
check_data_folder,
)
from .cli.logging import pretty_path
### Parsing Helpers
@ -66,6 +42,7 @@ base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links
without_www = lambda url: url.replace('://www.', '://', 1)
without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')
hashurl = lambda url: base32_encode(int(sha256(base_url(url).encode('utf-8')).hexdigest(), 16))[:20]
is_static_file = lambda url: extension(url).lower() in STATICFILE_EXTENSIONS # TODO: the proper way is with MIME type detection, not using extension
urlencode = lambda s: s and quote(s, encoding='utf-8', errors='replace')
urldecode = lambda s: s and unquote(s)
@ -85,36 +62,7 @@ URL_REGEX = re.compile(
r'[^\]\[\(\)<>\""\'\s]+', # stop parsing at these symbols
re.IGNORECASE,
)
HTML_TITLE_REGEX = re.compile(
r'<title.*?>' # start matching text after <title> tag
r'(.[^<>]+)', # get everything up to these symbols
re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)
STATICFILE_EXTENSIONS = {
# 99.999% of the time, URLs ending in these extentions are static files
# that can be downloaded as-is, not html pages that need to be rendered
'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
'atom', 'rss', 'css', 'js', 'json',
'dmg', 'iso', 'img',
'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
# Less common extensions to consider adding later
# jar, swf, bin, com, exe, dll, deb
# ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
# pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
# ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
# Thse are always treated as pages, not as static files, never add them:
# html, htm, shtml, xhtml, xml, aspx, php, cgi
}
### Checks & Tests
def enforce_types(func):
"""
@ -158,189 +106,14 @@ def enforce_types(func):
return typechecked_function
def check_url_parsing_invariants() -> None:
"""Check that plain text regex URL parsing works as expected"""
def docstring(text: Optional[str]):
"""attach the given docstring to the decorated function"""
def decorator(func):
if text:
func.__doc__ = text
return func
return decorator
# this is last-line-of-defense to make sure the URL_REGEX isn't
# misbehaving, as the consequences could be disastrous and lead to many
# incorrect/badly parsed links being added to the archive
test_urls = '''
https://example1.com/what/is/happening.html?what=1#how-about-this=1
https://example2.com/what/is/happening/?what=1#how-about-this=1
HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
https://example4.com/what/is/happening.html
https://example5.com/
https://example6.com
<test>http://example7.com</test>
[https://example8.com/what/is/this.php?what=1]
[and http://example9.com?what=1&other=3#and-thing=2]
<what>https://example10.com#and-thing=2 "</about>
abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
example13.bada
and example14.badb
<or>htt://example15.badc</that>
'''
# print('\n'.join(re.findall(URL_REGEX, test_urls)))
assert len(re.findall(URL_REGEX, test_urls)) == 12
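The new docstring decorator (added above, interleaved with the removed check_url_parsing_invariants) replaces the heavier set_docstring wrapper removed further down; it simply attaches the given text as __doc__ so a function's help text can be reused elsewhere. Example (do_nothing is an arbitrary placeholder):

    from archivebox.util import docstring

    @docstring('Add a new URL or list of URLs to your archive')
    def do_nothing():
        ...

    print(do_nothing.__doc__)   # 'Add a new URL or list of URLs to your archive'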
### Random Helpers
@enforce_types
def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str:
check_data_folder(out_dir=out_dir)
sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
if not os.path.exists(sources_dir):
os.makedirs(sources_dir)
ts = str(datetime.now().timestamp()).split('.', 1)[0]
source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts))
atomic_write(raw_text, source_path)
return source_path
@enforce_types
def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DIR) -> str:
"""download a given url's content into output/sources/domain-<timestamp>.txt"""
check_data_folder(out_dir=out_dir)
sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
if not os.path.exists(sources_dir):
os.makedirs(sources_dir)
ts = str(datetime.now().timestamp()).split('.', 1)[0]
source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts))
if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts))
print('{}[*] [{}] Downloading {}{}'.format(
ANSI['green'],
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
path,
ANSI['reset'],
))
timer = TimedProgress(timeout, prefix=' ')
try:
raw_source_text = download_url(path, timeout=timeout)
timer.end()
except Exception as e:
timer.end()
print('{}[!] Failed to download {}{}\n'.format(
ANSI['red'],
path,
ANSI['reset'],
))
print(' ', e)
raise SystemExit(1)
else:
with open(path, 'r') as f:
raw_source_text = f.read()
atomic_write(raw_source_text, source_path)
print(' > {}'.format(pretty_path(source_path)))
return source_path
@enforce_types
def fetch_page_title(url: str, timeout: int=10, progress: bool=SHOW_PROGRESS) -> Optional[str]:
"""Attempt to guess a page's title by downloading the html"""
if not SAVE_TITLE:
return None
try:
html = download_url(url, timeout=timeout)
match = re.search(HTML_TITLE_REGEX, html)
return htmldecode(match.group(1).strip()) if match else None
except Exception as err: # noqa
# print('[!] Failed to fetch title because of {}: {}'.format(
# err.__class__.__name__,
# err,
# ))
return None
@enforce_types
def wget_output_path(link: Link) -> Optional[str]:
"""calculate the path to the wgetted .html file, since wget may
adjust some paths to be different than the base_url path.
See docs on wget --adjust-extension (-E)
"""
if is_static_file(link.url):
return without_scheme(without_fragment(link.url))
# Wget downloads can save in a number of different ways depending on the url:
# https://example.com
# > output/archive/<timestamp>/example.com/index.html
# https://example.com?v=zzVa_tX1OiI
# > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
# https://www.example.com/?v=zzVa_tX1OiI
# > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
# https://example.com/abc
# > output/archive/<timestamp>/example.com/abc.html
# https://example.com/abc/
# > output/archive/<timestamp>/example.com/abc/index.html
# https://example.com/abc?v=zzVa_tX1OiI.html
# > output/archive/<timestamp>/example.com/abc?v=zzVa_tX1OiI.html
# https://example.com/abc/?v=zzVa_tX1OiI.html
# > output/archive/<timestamp>/example.com/abc/index.html?v=zzVa_tX1OiI.html
# https://example.com/abc/test.html
# > output/archive/<timestamp>/example.com/abc/test.html
# https://example.com/abc/test?v=zzVa_tX1OiI
# > output/archive/<timestamp>/example.com/abc/test?v=zzVa_tX1OiI.html
# https://example.com/abc/test/?v=zzVa_tX1OiI
# > output/archive/<timestamp>/example.com/abc/test/index.html?v=zzVa_tX1OiI.html
# There's also lots of complexity around how the urlencoding and renaming
# is done for pages with query and hash fragments or extensions like shtml / htm / php / etc
# Since the wget algorithm for -E (appending .html) is incredibly complex
# and there's no way to get the computed output path from wget
# in order to avoid having to reverse-engineer how they calculate it,
# we just look in the output folder read the filename wget used from the filesystem
full_path = without_fragment(without_query(path(link.url))).strip('/')
search_dir = os.path.join(
link.link_dir,
domain(link.url),
urldecode(full_path),
)
for _ in range(4):
if os.path.exists(search_dir):
if os.path.isdir(search_dir):
html_files = [
f for f in os.listdir(search_dir)
if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
]
if html_files:
path_from_link_dir = search_dir.split(link.link_dir)[-1].strip('/')
return os.path.join(path_from_link_dir, html_files[0])
# Move up one directory level
search_dir = search_dir.rsplit('/', 1)[0]
if search_dir == link.link_dir:
break
return None
### String Manipulation & Logging Helpers
@enforce_types
def str_between(string: str, start: str, end: str=None) -> str:
@ -415,122 +188,6 @@ def parse_date(date: Any) -> Optional[datetime]:
raise ValueError('Tried to parse invalid date! {}'.format(date))
@enforce_types
def is_static_file(url: str) -> bool:
"""Certain URLs just point to a single static file, and
don't need to be re-archived in many formats
"""
# TODO: the proper way is with MIME type detection, not using extension
return extension(url) in STATICFILE_EXTENSIONS
### Python / System Helpers
def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
"""Patched of subprocess.run to fix blocking io making timeout=innefective"""
if input is not None:
if 'stdin' in kwargs:
raise ValueError('stdin and input arguments may not both be used.')
kwargs['stdin'] = PIPE
if capture_output:
if ('stdout' in kwargs) or ('stderr' in kwargs):
raise ValueError('stdout and stderr arguments may not be used '
'with capture_output.')
kwargs['stdout'] = PIPE
kwargs['stderr'] = PIPE
with Popen(*popenargs, **kwargs) as process:
try:
stdout, stderr = process.communicate(input, timeout=timeout)
except TimeoutExpired:
process.kill()
try:
stdout, stderr = process.communicate(input, timeout=2)
except:
pass
raise TimeoutExpired(popenargs[0][0], timeout)
except BaseException:
process.kill()
# We don't call process.wait() as .__exit__ does that for us.
raise
retcode = process.poll()
if check and retcode:
raise CalledProcessError(retcode, process.args,
output=stdout, stderr=stderr)
return CompletedProcess(process.args, retcode, stdout, stderr)
class TimedProgress:
"""Show a progress bar and measure elapsed time until .end() is called"""
def __init__(self, seconds, prefix=''):
if SHOW_PROGRESS:
self.p = Process(target=progress_bar, args=(seconds, prefix))
self.p.start()
self.stats = {'start_ts': datetime.now(), 'end_ts': None}
def end(self):
"""immediately end progress, clear the progressbar line, and save end_ts"""
end_ts = datetime.now()
self.stats['end_ts'] = end_ts
if SHOW_PROGRESS:
# protect from double termination
#if p is None or not hasattr(p, 'kill'):
# return
if self.p is not None:
self.p.terminate()
self.p = None
sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset'])) # clear whole terminal line
@enforce_types
def progress_bar(seconds: int, prefix: str='') -> None:
"""show timer in the form of progress bar, with percentage and seconds remaining"""
chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
chunks = TERM_WIDTH() - len(prefix) - 20 # number of progress chunks to show (aka max bar width)
try:
for s in range(seconds * chunks):
chunks = TERM_WIDTH() - len(prefix) - 20
progress = s / chunks / seconds * 100
bar_width = round(progress/(100/chunks))
# ████████████████████ 0.9% (1/60sec)
sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
prefix,
ANSI['green'],
(chunk * bar_width).ljust(chunks),
ANSI['reset'],
round(progress, 1),
round(s/chunks),
seconds,
))
sys.stdout.flush()
time.sleep(1 / chunks)
# ██████████████████████████████████ 100.0% (60/60sec)
sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
prefix,
ANSI['red'],
chunk * chunks,
ANSI['reset'],
100.0,
seconds,
seconds,
))
sys.stdout.flush()
except KeyboardInterrupt:
print()
pass
@enforce_types
def download_url(url: str, timeout: int=TIMEOUT) -> str:
"""Download the contents of a remote url and return the text"""
@ -547,58 +204,6 @@ def download_url(url: str, timeout: int=TIMEOUT) -> str:
return resp.read().decode(encoding)
@enforce_types
def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, timeout: int=30) -> None:
"""chmod -R <permissions> <cwd>/<path>"""
if not os.path.exists(os.path.join(cwd, path)):
raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
if chmod_result.returncode == 1:
print(' ', chmod_result.stderr.decode())
raise Exception('Failed to chmod {}/{}'.format(cwd, path))
@enforce_types
def copy_and_overwrite(from_path: str, to_path: str):
if os.path.isdir(from_path):
shutil.rmtree(to_path, ignore_errors=True)
shutil.copytree(from_path, to_path)
else:
with open(from_path, 'rb') as src:
atomic_write(src.read(), to_path)
@enforce_types
def get_dir_size(path: str, recursive: bool=True, pattern: Optional[str]=None) -> Tuple[int, int, int]:
num_bytes, num_dirs, num_files = 0, 0, 0
for entry in os.scandir(path):
if (pattern is not None) and (pattern not in entry.path):
continue
if entry.is_dir(follow_symlinks=False):
if not recursive:
continue
num_dirs += 1
bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
num_bytes += bytes_inside
num_dirs += dirs_inside
num_files += files_inside
else:
num_bytes += entry.stat(follow_symlinks=False).st_size
num_files += 1
return num_bytes, num_dirs, num_files
@enforce_types
def human_readable_size(num_bytes: Union[int, float]) -> str:
for count in ['Bytes','KB','MB','GB']:
if num_bytes > -1024.0 and num_bytes < 1024.0:
return '%3.1f %s' % (num_bytes, count)
num_bytes /= 1024.0
return '%3.1f %s' % (num_bytes, 'TB')
@enforce_types
def chrome_args(**options) -> List[str]:
"""helper to build up a chrome shell command with arguments"""
@ -632,7 +237,7 @@ def chrome_args(**options) -> List[str]:
return cmd_args
class ExtendedEncoder(JSONEncoder):
class ExtendedEncoder(pyjson.JSONEncoder):
"""
Extended json serializer that supports serializing several model
fields and objects
@ -656,114 +261,5 @@ class ExtendedEncoder(JSONEncoder):
elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
return tuple(obj)
return JSONEncoder.default(self, obj)
return pyjson.JSONEncoder.default(self, obj)
def to_json(obj: Any, file: IO=None, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str:
if file:
path = os.path.realpath(file.name)
contents = json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
atomic_write(contents, path)
return contents
else:
return json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
def links_to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
header: bool=True, ljust: int=0, separator: str=',') -> str:
csv_cols = csv_cols or ['timestamp', 'is_archived', 'url']
header_str = ''
if header:
header_str = separator.join(col.ljust(ljust) for col in csv_cols)
row_strs = (
link.to_csv(csv_cols=csv_cols, ljust=ljust, separator=separator)
for link in links
)
return '\n'.join((header_str, *row_strs))
def folders_to_str(folders: Dict[str, Optional[Link]]) -> str:
return '\n'.join(f'{folder} {link}' for folder, link in folders.items())
@enforce_types
def render_template(template_path: str, context: Mapping[str, str]) -> str:
"""render a given html template string with the given template content"""
# will be replaced by django templates in the future
with open(template_path, 'r', encoding='utf-8') as template:
template_str = template.read()
return Template(template_str).substitute(**context)
def atomic_write(contents: Union[dict, str, bytes], path: str) -> None:
"""Safe atomic write to filesystem by writing to temp file + atomic rename"""
try:
tmp_file = '{}.tmp'.format(path)
if isinstance(contents, bytes):
args = {'mode': 'wb+'}
else:
args = {'mode': 'w+', 'encoding': 'utf-8'}
with open(tmp_file, **args) as f:
if isinstance(contents, dict):
to_json(contents, file=f)
else:
f.write(contents)
os.fsync(f.fileno())
os.rename(tmp_file, path)
chmod_file(path)
finally:
if os.path.exists(tmp_file):
os.remove(tmp_file)
def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
"""Tell the user they passed stdin to a command that doesn't accept it"""
if stdin and not stdin.isatty():
stdin_raw_text = stdin.read().strip()
if stdin_raw_text:
print(
'{red}[X] The "{}" command does not accept stdin.{reset}\n'.format(
caller,
**ANSI,
)
)
print(' Run archivebox "{} --help" to see usage and examples.'.format(
caller,
))
print()
raise SystemExit(1)
def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
if stdin and not stdin.isatty():
return stdin.read()
return None
def set_docstring(text: str):
def decorator(func):
@wraps(func)
def wrapper_with_docstring(*args, **kwargs):
return func(*args, **kwargs)
wrapper_with_docstring.__doc__ = text
return wrapper_with_docstring
return decorator
class SmartFormatter(argparse.HelpFormatter):
def _split_lines(self, text, width):
if '\n' in text:
return text.splitlines()
return argparse.HelpFormatter._split_lines(self, text, width)
class ArchiveError(Exception):
def __init__(self, message, hints=None):
super().__init__(message)
self.hints = hints