mirror of https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-06-01 07:18:27 -04:00

Merge branch 'dev' into search_index_extract_html_text
commit a680724367
29 changed files with 3230 additions and 1654 deletions
@@ -90,8 +90,13 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
     'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'},
     'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
-    'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'},  # to avoid downloading code assets as their own pages
-    'URL_WHITELIST': {'type': str, 'default': None},
+    'URL_DENYLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', 'aliases': ('URL_BLACKLIST',)},  # to avoid downloading code assets as their own pages
+    'URL_ALLOWLIST': {'type': str, 'default': None, 'aliases': ('URL_WHITELIST',)},
+
+    'ADMIN_USERNAME': {'type': str, 'default': None},
+    'ADMIN_PASSWORD': {'type': str, 'default': None},
+
     'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
+    'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
 },

@@ -143,6 +148,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
     'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},
     'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)},
+    'SAVE_ALLOWLIST': {'type': dict, 'default': {},},
+    'SAVE_DENYLIST': {'type': dict, 'default': {},},
 },

 'ARCHIVE_METHOD_OPTIONS': {

@@ -231,12 +238,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'CURL_BINARY': {'type': str, 'default': 'curl'},
     'GIT_BINARY': {'type': str, 'default': 'git'},
-    'WGET_BINARY': {'type': str, 'default': 'wget'},
+    'WGET_BINARY': {'type': str, 'default': 'wget'},  # also can accept wget2
     'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
     'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
-    'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')},
-    #'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
-    'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},
+    'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('postlight-parser')},
+    'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},  # also can accept youtube-dl
     'NODE_BINARY': {'type': str, 'default': 'node'},
     'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
     'CHROME_BINARY': {'type': str, 'default': None},

@@ -374,6 +380,8 @@ def get_commit_hash(config):
 ############################## Derived Config ##################################

+ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
+
 DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'TERM_WIDTH': {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns},
     'USER': {'default': lambda c: SYSTEM_USER},

@@ -390,8 +398,8 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
     'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
     'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)},  # None means unset, so we autodetect it with find_chrome_Data_dir(), but emptystring '' means user manually set it to '', and we should store it as None
-    'URL_BLACKLIST_PTN': {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
-    'URL_WHITELIST_PTN': {'default': lambda c: c['URL_WHITELIST'] and re.compile(c['URL_WHITELIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
+    'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
+    'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
     'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},

     'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')},

@@ -435,7 +443,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},

     'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
-    'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None},  # mercury is unversioned
+    'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None},  # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750

     'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
     'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},

@@ -465,10 +473,11 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
     'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
     'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
+    'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
+    'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
 }


 ################################### Helpers ####################################
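For context, a rough sketch of how the renamed deny/allowlist options behave once compiled into URL_DENYLIST_PTN by the Derived Config section above (the pattern and flags are copied from the defaults; the example URLs are hypothetical):

import re

ALLOWDENYLIST_REGEX_FLAGS = re.IGNORECASE | re.UNICODE | re.MULTILINE

URL_DENYLIST = r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'
URL_DENYLIST_PTN = re.compile(URL_DENYLIST, ALLOWDENYLIST_REGEX_FLAGS)

assert URL_DENYLIST_PTN.search('https://example.com/static/app.js?v=2')  # asset URL: matched, so skipped
assert not URL_DENYLIST_PTN.search('https://example.com/blog/post')      # normal page: not matched, archived

The old URL_BLACKLIST/URL_WHITELIST names keep working through the new 'aliases' entries, so existing configs are not broken by the rename.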
@@ -41,7 +41,7 @@ class ConfigDict(BaseConfig, total=False):
     MEDIA_TIMEOUT: int
     OUTPUT_PERMISSIONS: str
     RESTRICT_FILE_NAMES: str
-    URL_BLACKLIST: str
+    URL_DENYLIST: str

     SECRET_KEY: Optional[str]
     BIND_ADDR: str
@@ -41,7 +41,7 @@ class AddLinkForm(forms.Form):
     #     label="Exclude patterns",
     #     min_length='1',
     #     required=False,
-    #     initial=URL_BLACKLIST,
+    #     initial=URL_DENYLIST,
     # )
     # timeout = forms.IntegerField(
     #     initial=TIMEOUT,
@@ -6,9 +6,6 @@ import re
 import logging
 import tempfile

-import ldap
-from django_auth_ldap.config import LDAPSearch
-
 from pathlib import Path
 from django.utils.crypto import get_random_string

@@ -97,33 +94,43 @@ AUTHENTICATION_BACKENDS = [
 ]

 if LDAP:
-    global AUTH_LDAP_SERVER_URI
-    AUTH_LDAP_SERVER_URI = LDAP_SERVER_URI
-
-    global AUTH_LDAP_BIND_DN
-    AUTH_LDAP_BIND_DN = LDAP_BIND_DN
-
-    global AUTH_LDAP_BIND_PASSWORD
-    AUTH_LDAP_BIND_PASSWORD = LDAP_BIND_PASSWORD
-
-    global AUTH_LDAP_USER_SEARCH
-    AUTH_LDAP_USER_SEARCH = LDAPSearch(
-        LDAP_USER_BASE,
-        ldap.SCOPE_SUBTREE,
-        '(&(' + LDAP_USERNAME_ATTR + '=%(user)s)' + LDAP_USER_FILTER + ')',
-    )
-
-    global AUTH_LDAP_USER_ATTR_MAP
-    AUTH_LDAP_USER_ATTR_MAP = {
-        'username': LDAP_USERNAME_ATTR,
-        'first_name': LDAP_FIRSTNAME_ATTR,
-        'last_name': LDAP_LASTNAME_ATTR,
-        'email': LDAP_EMAIL_ATTR,
-    }
-
-    AUTHENTICATION_BACKENDS = [
-        'django_auth_ldap.backend.LDAPBackend',
-    ]
+    try:
+        import ldap
+        from django_auth_ldap.config import LDAPSearch
+
+        global AUTH_LDAP_SERVER_URI
+        global AUTH_LDAP_BIND_DN
+        global AUTH_LDAP_BIND_PASSWORD
+        global AUTH_LDAP_USER_SEARCH
+        global AUTH_LDAP_USER_ATTR_MAP
+
+        AUTH_LDAP_SERVER_URI = LDAP_SERVER_URI
+        AUTH_LDAP_BIND_DN = LDAP_BIND_DN
+        AUTH_LDAP_BIND_PASSWORD = LDAP_BIND_PASSWORD
+
+        assert AUTH_LDAP_SERVER_URI and LDAP_USERNAME_ATTR and LDAP_USER_FILTER, 'LDAP_* config options must all be set if LDAP=True'
+
+        AUTH_LDAP_USER_SEARCH = LDAPSearch(
+            LDAP_USER_BASE,
+            ldap.SCOPE_SUBTREE,
+            '(&(' + LDAP_USERNAME_ATTR + '=%(user)s)' + LDAP_USER_FILTER + ')',
+        )
+
+        AUTH_LDAP_USER_ATTR_MAP = {
+            'username': LDAP_USERNAME_ATTR,
+            'first_name': LDAP_FIRSTNAME_ATTR,
+            'last_name': LDAP_LASTNAME_ATTR,
+            'email': LDAP_EMAIL_ATTR,
+        }
+
+        AUTHENTICATION_BACKENDS = [
+            'django_auth_ldap.backend.LDAPBackend',
+        ]
+    except ModuleNotFoundError:
+        sys.stderr.write('[X] Error: Found LDAP=True config but LDAP packages not installed. You may need to run: pip install archivebox[ldap]\n\n')
+        # dont hard exit here. in case the user is just running "archivebox version" or "archivebox help", we still want those to work despite broken ldap
+        # sys.exit(1)

 ################################################################################
 ### Debug Settings
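The restructuring above defers the ldap import until the feature is actually enabled, so a missing optional dependency no longer breaks unrelated commands at startup. A minimal sketch of the same guard pattern in isolation (the LDAP flag value here is hypothetical; in the real settings it comes from user config):

import sys

LDAP = True  # hypothetical value, normally loaded from user config

if LDAP:
    try:
        import ldap  # heavy optional dependency, only needed when LDAP auth is on
        # ... configure django_auth_ldap using the LDAP_* options here ...
    except ModuleNotFoundError:
        sys.stderr.write('[X] Error: LDAP=True but the ldap packages are not installed\n')
        # deliberately no sys.exit(1), so commands like "archivebox version" keep working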
@@ -4,12 +4,16 @@ import os
 import sys
 from pathlib import Path

-from typing import Optional, List, Iterable, Union
+from typing import Callable, Optional, List, Iterable, Union
 from datetime import datetime, timezone
 from django.db.models import QuerySet

+from ..config import (
+    SAVE_ALLOWLIST_PTN,
+    SAVE_DENYLIST_PTN,
+)
 from ..core.settings import ERROR_LOG
-from ..index.schema import Link
+from ..index.schema import ArchiveResult, Link
 from ..index.sql import write_link_to_sql_index
 from ..index import (
     load_link_details,

@@ -43,7 +47,11 @@ from .archive_org import should_save_archive_dot_org, save_archive_dot_org
 from .headers import should_save_headers, save_headers


-def get_default_archive_methods():
+ShouldSaveFunction = Callable[[Link, Optional[Path], Optional[bool]], bool]
+SaveFunction = Callable[[Link, Optional[Path], int], ArchiveResult]
+ArchiveMethodEntry = tuple[str, ShouldSaveFunction, SaveFunction]
+
+def get_default_archive_methods() -> List[ArchiveMethodEntry]:
     return [
         ('favicon', should_save_favicon, save_favicon),
         ('headers', should_save_headers, save_headers),
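For context, a minimal sketch of the contract the new type aliases describe, with a toy extractor (the names and bodies here are hypothetical; only the signatures follow the aliases above):

from pathlib import Path
from typing import Callable, Optional

class Link: ...           # stand-in for archivebox's Link
class ArchiveResult: ...  # stand-in for archivebox's ArchiveResult

ShouldSaveFunction = Callable[[Link, Optional[Path], Optional[bool]], bool]
SaveFunction = Callable[[Link, Optional[Path], int], ArchiveResult]
ArchiveMethodEntry = tuple[str, ShouldSaveFunction, SaveFunction]

def should_save_example(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
    return True  # decide whether this extractor applies to the link

def save_example(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
    return ArchiveResult()  # run the extractor and report the outcome

entry: ArchiveMethodEntry = ('example', should_save_example, save_example)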
@@ -71,12 +79,30 @@ ARCHIVE_METHODS_INDEXING_PRECEDENCE = [
     ('wget', 6)
 ]

+
+def get_archive_methods_for_link(link: Link) -> Iterable[ArchiveMethodEntry]:
+    DEFAULT_METHODS = get_default_archive_methods()
+    allowed_methods = {
+        m for pat, methods in
+        SAVE_ALLOWLIST_PTN.items()
+        if pat.search(link.url)
+        for m in methods
+    } or { m[0] for m in DEFAULT_METHODS }
+    denied_methods = {
+        m for pat, methods in
+        SAVE_DENYLIST_PTN.items()
+        if pat.search(link.url)
+        for m in methods
+    }
+    allowed_methods -= denied_methods
+
+    return (m for m in DEFAULT_METHODS if m[0] in allowed_methods)
+
 @enforce_types
-def ignore_methods(to_ignore: List[str]):
+def ignore_methods(to_ignore: List[str]) -> Iterable[str]:
     ARCHIVE_METHODS = get_default_archive_methods()
-    methods = filter(lambda x: x[0] not in to_ignore, ARCHIVE_METHODS)
-    methods = map(lambda x: x[0], methods)
-    return list(methods)
+    return [x[0] for x in ARCHIVE_METHODS if x[0] not in to_ignore]

 @enforce_types
 def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link:
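The new per-URL method selection keys compiled regexes to lists of extractor names: allowlist matches narrow the set (falling back to all defaults when nothing matches), then denylist matches are subtracted. A self-contained sketch of the same logic, using hypothetical patterns and a simplified method list:

import re

ALLOWDENYLIST_REGEX_FLAGS = re.IGNORECASE | re.UNICODE | re.MULTILINE

# hypothetical user config: regex pattern -> list of extractor names
SAVE_ALLOWLIST_PTN = {re.compile(r'//(www\.)?youtube\.com/', ALLOWDENYLIST_REGEX_FLAGS): ['media', 'title']}
SAVE_DENYLIST_PTN = {re.compile(r'\.pdf$', ALLOWDENYLIST_REGEX_FLAGS): ['screenshot']}

DEFAULT_METHODS = ['title', 'favicon', 'wget', 'screenshot', 'media']

def methods_for(url: str) -> list[str]:
    allowed = {m for pat, methods in SAVE_ALLOWLIST_PTN.items() if pat.search(url) for m in methods} or set(DEFAULT_METHODS)
    denied = {m for pat, methods in SAVE_DENYLIST_PTN.items() if pat.search(url) for m in methods}
    return [m for m in DEFAULT_METHODS if m in (allowed - denied)]

print(methods_for('https://www.youtube.com/watch?v=abc'))  # ['title', 'media'] (allowlist narrows)
print(methods_for('https://example.com/page'))             # all defaults (no pattern matched)
print(methods_for('https://example.com/doc.pdf'))          # defaults minus 'screenshot' (denylisted)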
@@ -89,11 +115,11 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link:
     except Snapshot.DoesNotExist:
         snapshot = write_link_to_sql_index(link)

-    ARCHIVE_METHODS = get_default_archive_methods()
+    active_methods = get_archive_methods_for_link(link)

     if methods:
-        ARCHIVE_METHODS = [
-            method for method in ARCHIVE_METHODS
+        active_methods = [
+            method for method in active_methods
             if method[0] in methods
         ]

@@ -110,7 +136,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link:
     stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
     start_ts = datetime.now(timezone.utc)

-    for method_name, should_run, method_function in ARCHIVE_METHODS:
+    for method_name, should_run, method_function in active_methods:
         try:
             if method_name not in link.history:
                 link.history[method_name] = []
@@ -71,7 +71,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
         result = run(cmd, cwd=out_dir, timeout=timeout)
         try:
             result_json = json.loads(result.stdout)
-            assert result_json and 'content' in result_json
+            assert result_json and 'content' in result_json, 'Readability output is not valid JSON'
         except json.JSONDecodeError:
             raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr)

@@ -85,7 +85,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
         #  "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
         output_tail = [
             line.strip()
-            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
+            for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
             if line.strip()
         ]
         hints = (
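The rsplit change widens the captured tail of the extractor's output from the last 3 lines to the last 5, so error hints include more context. A quick illustration of what the expression returns (sample output is hypothetical):

output = b'one\ntwo\nthree\nfour\nfive\nsix'.decode()
tail = [line.strip() for line in output.rsplit('\n', 5)[-5:] if line.strip()]
print(tail)  # ['two', 'three', 'four', 'five', 'six'] -- the last 5 non-empty lines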
@@ -22,8 +22,8 @@ from ..config import (
     JSON_INDEX_FILENAME,
     OUTPUT_DIR,
     TIMEOUT,
-    URL_BLACKLIST_PTN,
-    URL_WHITELIST_PTN,
+    URL_DENYLIST_PTN,
+    URL_ALLOWLIST_PTN,
     stderr,
     OUTPUT_PERMISSIONS
 )

@@ -142,9 +142,9 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
             continue
         if scheme(link.url) not in ('http', 'https', 'ftp'):
             continue
-        if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url):
+        if URL_DENYLIST_PTN and URL_DENYLIST_PTN.search(link.url):
             continue
-        if URL_WHITELIST_PTN and (not URL_WHITELIST_PTN.search(link.url)):
+        if URL_ALLOWLIST_PTN and (not URL_ALLOWLIST_PTN.search(link.url)):
             continue

         yield link
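For context, the same scheme + deny/allow check in isolation (a sketch; the abbreviated denylist pattern and example URLs are hypothetical):

import re
from urllib.parse import urlparse

URL_DENYLIST_PTN = re.compile(r'\.(css|js)(\?.*)?$', re.IGNORECASE)  # abbreviated version of the default
URL_ALLOWLIST_PTN = None  # unset by default, so it never filters anything out

def archivable(url: str) -> bool:
    if urlparse(url).scheme not in ('http', 'https', 'ftp'):
        return False
    if URL_DENYLIST_PTN and URL_DENYLIST_PTN.search(url):
        return False
    if URL_ALLOWLIST_PTN and not URL_ALLOWLIST_PTN.search(url):
        return False
    return True

print(archivable('https://example.com/article'))    # True
print(archivable('https://example.com/style.css'))  # False (denylisted asset)
print(archivable('mailto:someone@example.com'))     # False (unsupported scheme)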
@@ -533,11 +533,27 @@ def log_shell_welcome_msg():
 ### Helpers

 @enforce_types
-def pretty_path(path: Union[Path, str]) -> str:
+def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=OUTPUT_DIR) -> str:
     """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
-    pwd = Path('.').resolve()
-    # parent = os.path.abspath(os.path.join(pwd, os.path.pardir))
-    return str(path).replace(str(pwd) + '/', './')
+    pwd = str(Path(pwd))  # .resolve()
+    path = str(path)
+
+    if not path:
+        return path
+
+    # replace long absolute paths with ./ relative ones to save on terminal output width
+    if path.startswith(pwd) and (pwd != '/'):
+        path = path.replace(pwd, '.', 1)
+
+    # quote paths containing spaces
+    if ' ' in path:
+        path = f'"{path}"'
+
+    # if path is just a plain dot, replace it back with the absolute path for clarity
+    if path == '.':
+        path = pwd
+
+    return path


 @enforce_types
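Given the rewritten helper above, these are the outputs to expect (a runnable sketch with a condensed copy of the function; the paths and the '/data' default are hypothetical):

from pathlib import Path

def pretty_path(path, pwd='/data'):  # condensed copy of the new helper above
    pwd, path = str(Path(pwd)), str(path)
    if not path:
        return path
    if path.startswith(pwd) and (pwd != '/'):
        path = path.replace(pwd, '.', 1)
    if ' ' in path:
        path = f'"{path}"'
    if path == '.':
        path = pwd
    return path

print(pretty_path('/data/archive/1234'))      # ./archive/1234
print(pretty_path('/data/my docs/file.txt'))  # "./my docs/file.txt" (quoted because of the space)
print(pretty_path('/data'))                   # /data (a bare '.' is expanded back to the full path)
print(pretty_path(''))                        # '' passes through unchanged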
@@ -578,6 +594,7 @@ def printable_folder_status(name: str, folder: Dict) -> str:
     else:
         color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'

+
     if folder['path']:
         if Path(folder['path']).exists():
             num_files = (

@@ -592,13 +609,7 @@ def printable_folder_status(name: str, folder: Dict) -> str:
         # add symbol @ next to filecount if path is a remote filesystem mount
         num_files = f'{num_files} @' if num_files else '@'

-    path = str(folder['path']).replace(str(OUTPUT_DIR), '.') if folder['path'] else ''
-    if path and ' ' in path:
-        path = f'"{path}"'
-
-    # if path is just a plain dot, replace it back with the full path for clarity
-    if path == '.':
-        path = str(OUTPUT_DIR)
+    path = pretty_path(folder['path'])

     return ' '.join((
         ANSI[color],

@@ -629,9 +640,7 @@ def printable_dependency_version(name: str, dependency: Dict) -> str:
     else:
         color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'

-    path = str(dependency["path"]).replace(str(OUTPUT_DIR), '.') if dependency["path"] else ''
-    if path and ' ' in path:
-        path = f'"{path}"'
+    path = pretty_path(dependency['path'])

     return ' '.join((
         ANSI[color],
@@ -112,6 +112,8 @@ from .config import (
     load_all_config,
     CONFIG,
     USER_CONFIG,
+    ADMIN_USERNAME,
+    ADMIN_PASSWORD,
     get_real_name,
     setup_django,
 )

@@ -216,7 +218,7 @@ def version(quiet: bool=False,
     if not quiet:
         # 0.6.3
         # ArchiveBox v0.6.3 Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
-        # DEBUG=False IN_DOCKER=True IS_TTY=True TZ=UTC FS_ATOMIC=True FS_REMOTE=False FS_PERMS=644 501:20 SEARCH_BACKEND=ripgrep
+        # DEBUG=False IN_DOCKER=True IS_TTY=True TZ=UTC FS_ATOMIC=True FS_REMOTE=False FS_PERMS=644 FS_USER=501:20 SEARCH_BACKEND=ripgrep

         p = platform.uname()
         print(

@@ -236,7 +238,8 @@ def version(quiet: bool=False,
             #f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})',  # add this if we have more useful info to show eventually
             f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
             f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
-            f'FS_PERMS={OUTPUT_PERMISSIONS} {PUID}:{PGID}',
+            f'FS_USER={PUID}:{PGID}',
+            f'FS_PERMS={OUTPUT_PERMISSIONS}',
             f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
         )
         print()

@@ -251,19 +254,19 @@ def version(quiet: bool=False,

         print()
         print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
-        for name, folder in CODE_LOCATIONS.items():
-            print(printable_folder_status(name, folder))
+        for name, path in CODE_LOCATIONS.items():
+            print(printable_folder_status(name, path))

         print()
         print('{white}[i] Secrets locations:{reset}'.format(**ANSI))
-        for name, folder in EXTERNAL_LOCATIONS.items():
-            print(printable_folder_status(name, folder))
+        for name, path in EXTERNAL_LOCATIONS.items():
+            print(printable_folder_status(name, path))

         print()
         if DATA_LOCATIONS['OUTPUT_DIR']['is_valid']:
             print('{white}[i] Data locations:{reset}'.format(**ANSI))
-            for name, folder in DATA_LOCATIONS.items():
-                print(printable_folder_status(name, folder))
+            for name, path in DATA_LOCATIONS.items():
+                print(printable_folder_status(name, path))
         else:
             print()
             print('{white}[i] Data locations:{reset}'.format(**ANSI))

@@ -419,14 +422,16 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
         write_main_index(list(pending_links.values()), out_dir=out_dir)

     print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))

+    from django.contrib.auth.models import User
+
+    if (ADMIN_USERNAME and ADMIN_PASSWORD) and not User.objects.filter(username=ADMIN_USERNAME).exists():
+        print('{green}[+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.{reset}'.format(**ANSI))
+        User.objects.create_superuser(username=ADMIN_USERNAME, password=ADMIN_PASSWORD)
+
     if existing_index:
         print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
     else:
-        # TODO: allow creating new supersuer via env vars on first init
-        # if config.HTTP_USER and config.HTTP_PASS:
-        #     from django.contrib.auth.models import User
-        #     User.objects.create_superuser(HTTP_USER, '', HTTP_PASS)
         print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links) + len(pending_links), **ANSI))

     json_index = out_dir / JSON_INDEX_FILENAME
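With the new ADMIN_USERNAME and ADMIN_PASSWORD options, init can create the admin account non-interactively, which closes out the old TODO above. A sketch of the guard in isolation (the credential values here are hypothetical; in practice they would come from the config file or environment when running archivebox init):

from django.contrib.auth.models import User

ADMIN_USERNAME, ADMIN_PASSWORD = 'admin', 'hunter2'  # hypothetical config values

# idempotent: only fires when both options are set and the user doesn't already exist
if (ADMIN_USERNAME and ADMIN_PASSWORD) and not User.objects.filter(username=ADMIN_USERNAME).exists():
    User.objects.create_superuser(username=ADMIN_USERNAME, password=ADMIN_PASSWORD)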
@@ -1,62 +1,3 @@
-{% extends "base.html" %}
-{% load static %}
-
-{% block body %}
-<div id="toolbar">
-    <form id="changelist-search" action="{% url 'public-index' %}" method="get">
-        <div>
-            <label for="searchbar"><img src="/static/admin/img/search.svg" alt="Search"></label>
-            <input type="text" size="40" name="q" value="" id="searchbar" autofocus placeholder="Title, URL, tags, timestamp, or content...".>
-            <input type="submit" value="Search" style="height: 36px; padding-top: 6px; margin: 8px"/>
-            <input type="button"
-                   value="♺"
-                   title="Refresh..."
-                   onclick="location.href='{% url 'public-index' %}'"
-                   style="background-color: rgba(121, 174, 200, 0.8); height: 30px; font-size: 0.8em; margin-top: 12px; padding-top: 6px; float:right">
-            </input>
-        </div>
-    </form>
-</div>
-<table id="table-bookmarks">
-    <thead>
-        <tr>
-            <th style="width: 100px;">Bookmarked</th>
-            <th style="width: 26vw;">Snapshot ({{object_list|length}})</th>
-            <th style="width: 140px">Files</th>
-            <th style="width: 16vw;whitespace:nowrap;overflow-x:hidden;">Original URL</th>
-        </tr>
-    </thead>
-    <tbody>
-        {% for link in object_list %}
-            {% include 'main_index_row.html' with link=link %}
-        {% endfor %}
-    </tbody>
-</table>
-<center>
-    <span class="step-links">
-        {% if page_obj.has_previous %}
-            <a href="{% url 'public-index' %}?page=1">« first</a>
-            <a href="{% url 'public-index' %}?page={{ page_obj.previous_page_number }}">previous</a>
-        {% endif %}
-
-        <span class="current">
-            Page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }}.
-        </span>
-
-        {% if page_obj.has_next %}
-            <a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a>
-            <a href="{% url 'public-index' %}?page={{ page_obj.paginator.num_pages }}">last »</a>
-        {% endif %}
-    </span>
-    <br>
-</center>
-{% endblock %}
+{% extends "admin/base_site.html" %}
+{% load i18n admin_urls static admin_list %}
+{% load core_tags %}
@@ -33,7 +33,7 @@
       <br/>
       <div class="loader"></div>
       <br/>
-      Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for progress...
+      Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for detailed progress...
     </center>
 </div>
 <form id="add-form" method="POST" class="p-form">{% csrf_token %}

@@ -46,19 +46,22 @@
 </form>
 <br/><br/><br/>
 <center id="delay-warning" style="display: none">
-    <small>(it's safe to leave this page, adding will continue in the background)</small>
+    <small>(you will be redirected to your <a href="/">Snapshot list</a> momentarily, its safe to close this page at any time)</small>
 </center>
 {% if absolute_add_path %}
-    <center id="bookmarklet">
+    <!-- <center id="bookmarklet">
         <p>Bookmark this link to quickly add to your archive:
         <a href="javascript:void(window.open('{{ absolute_add_path }}?url='+encodeURIComponent(document.location.href)));">Add to ArchiveBox</a></p>
-    </center>
+    </center> -->
 {% endif %}
 <script>
     document.getElementById('add-form').addEventListener('submit', function(event) {
         document.getElementById('in-progress').style.display = 'block'
         document.getElementById('add-form').style.display = 'none'
         document.getElementById('delay-warning').style.display = 'block'
+        setTimeout(function() {
+            window.location = '/'
+        }, 2000)
         return true
     })
 </script>