Merge branch 'dev' into method_allow_deny

2025-05-18 00:54:26 -04:00 · 2023-10-20 04:25:44 -07:00 · 2023-10-20 04:25:44 -07:00 · 63ad43f46c
commit 63ad43f46c
parent 2076474252 a58535baff
33 changed files with 4485 additions and 1748 deletions
--- a/archivebox/config.py
+++ b/archivebox/config.py
@ -57,9 +57,17 @@ SYSTEM_USER = getpass.getuser() or os.getlogin()
 try:
    import pwd
    SYSTEM_USER = pwd.getpwuid(os.geteuid()).pw_name or SYSTEM_USER
+except KeyError:
+    # Process' UID might not map to a user in cases such as running the Docker image
+    # (where `archivebox` is 999) as a different UID.
+    pass
 except ModuleNotFoundError:
    # pwd is only needed for some linux systems, doesn't exist on windows
    pass
+except Exception:
+    # this should never happen, uncomment to debug
+    # raise
+    pass

 ############################### Config Schema ##################################

@ -82,8 +90,13 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
        'MEDIA_TIMEOUT':            {'type': int,   'default': 3600},
        'OUTPUT_PERMISSIONS':       {'type': str,   'default': '644'},
        'RESTRICT_FILE_NAMES':      {'type': str,   'default': 'windows'},
+
        'URL_DENYLIST':             {'type': str,   'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', 'aliases': ('URL_BLACKLIST',)},  # to avoid downloading code assets as their own pages
        'URL_ALLOWLIST':            {'type': str,   'default': None, 'aliases': ('URL_WHITELIST',)},
+
+        'ADMIN_USERNAME':           {'type': str,   'default': None},
+        'ADMIN_PASSWORD':           {'type': str,   'default': None},
+
        'ENFORCE_ATOMIC_WRITES':    {'type': bool,  'default': True},
        'TAG_SEPARATOR_PATTERN':    {'type': str,   'default': r'[,]'},
    },
@ -100,12 +113,22 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
        'SNAPSHOTS_PER_PAGE':        {'type': int,   'default': 40},
        'CUSTOM_TEMPLATES_DIR':      {'type': str,   'default': None},
        'TIME_ZONE':                 {'type': str,   'default': 'UTC'},
-        'TIMEZONE':                 {'type': str,   'default': 'UTC'},
+        'TIMEZONE':                  {'type': str,   'default': 'UTC'},
        'REVERSE_PROXY_USER_HEADER': {'type': str,   'default': 'Remote-User'},
        'REVERSE_PROXY_WHITELIST':   {'type': str,   'default': ''},
        'LOGOUT_REDIRECT_URL':       {'type': str,   'default': '/'},
-        'PREVIEW_ORIGINALS':        {'type': bool,  'default': True},
-        'LOGOUT_REDIRECT_URL':   {'type': str,   'default': '/'},
+        'PREVIEW_ORIGINALS':         {'type': bool,  'default': True},
+
+        'LDAP':                      {'type': bool,  'default': False},
+        'LDAP_SERVER_URI':           {'type': str,   'default': None},
+        'LDAP_BIND_DN':              {'type': str,   'default': None},
+        'LDAP_BIND_PASSWORD':        {'type': str,   'default': None},
+        'LDAP_USER_BASE':            {'type': str,   'default': None},
+        'LDAP_USER_FILTER':          {'type': str,   'default': None},
+        'LDAP_USERNAME_ATTR':        {'type': str,   'default': None},
+        'LDAP_FIRSTNAME_ATTR':       {'type': str,   'default': None},
+        'LDAP_LASTNAME_ATTR':        {'type': str,   'default': None},
+        'LDAP_EMAIL_ATTR':           {'type': str,   'default': None},
    },

    'ARCHIVE_METHOD_TOGGLES': {
@ -151,10 +174,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
                                                                '--write-thumbnail',
                                                                '--no-call-home',
                                                                '--write-sub',
-                                                                '--all-subs',
-                                                                # There are too many of these and youtube
-                                                                # throttles you with HTTP error 429
-                                                                #'--write-auto-subs',
+                                                                '--write-auto-subs',
                                                                '--convert-subs=srt',
                                                                '--yes-playlist',
                                                                '--continue',
@ -167,7 +187,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
                                                                '--ignore-errors',
                                                                '--geo-bypass',
                                                                '--add-metadata',
-                                                                '--max-filesize={}'.format(c['MEDIA_MAX_SIZE']),
+                                                                '--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
                                                                ]},


@ -216,18 +236,19 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {

        'CURL_BINARY':              {'type': str,   'default': 'curl'},
        'GIT_BINARY':               {'type': str,   'default': 'git'},
-        'WGET_BINARY':              {'type': str,   'default': 'wget'},
+        'WGET_BINARY':              {'type': str,   'default': 'wget'},     # also can accept wget2
        'SINGLEFILE_BINARY':        {'type': str,   'default': lambda c: bin_path('single-file')},
        'READABILITY_BINARY':       {'type': str,   'default': lambda c: bin_path('readability-extractor')},
-        'MERCURY_BINARY':           {'type': str,   'default': lambda c: bin_path('mercury-parser')},
-        #'YOUTUBEDL_BINARY':         {'type': str,   'default': 'youtube-dl'},
-        'YOUTUBEDL_BINARY':         {'type': str,   'default': 'yt-dlp'},
+        'MERCURY_BINARY':           {'type': str,   'default': lambda c: bin_path('postlight-parser')},
+        'YOUTUBEDL_BINARY':         {'type': str,   'default': 'yt-dlp'},   # also can accept youtube-dl
        'NODE_BINARY':              {'type': str,   'default': 'node'},
        'RIPGREP_BINARY':           {'type': str,   'default': 'rg'},
        'CHROME_BINARY':            {'type': str,   'default': None},

        'POCKET_CONSUMER_KEY':      {'type': str,   'default': None},
        'POCKET_ACCESS_TOKENS':     {'type': dict,  'default': {}},
+
+        'READWISE_READER_TOKENS':     {'type': dict,  'default': {}},
    },
 }

@ -420,7 +441,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
    'READABILITY_VERSION':      {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},

    'USE_MERCURY':              {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
-    'MERCURY_VERSION':          {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None},  # mercury is unversioned
+    'MERCURY_VERSION':          {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None},  # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750

    'USE_GIT':                  {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
    'GIT_VERSION':              {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@ -20,6 +20,17 @@ from ..config import (
    OUTPUT_DIR,
    LOGS_DIR,
    TIMEZONE,
+
+    LDAP,
+    LDAP_SERVER_URI,
+    LDAP_BIND_DN,
+    LDAP_BIND_PASSWORD,
+    LDAP_USER_BASE,
+    LDAP_USER_FILTER,
+    LDAP_USERNAME_ATTR,
+    LDAP_FIRSTNAME_ATTR,
+    LDAP_LASTNAME_ATTR,
+    LDAP_EMAIL_ATTR,
 )

 IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
@ -55,6 +66,12 @@ INSTALLED_APPS = [
 ]


+# For usage with https://www.jetadmin.io/integrations/django
+# INSTALLED_APPS += ['jet_django']
+# JET_PROJECT = 'archivebox'
+# JET_TOKEN = 'some-api-token-here'
+
+
 MIDDLEWARE = [
    'core.middleware.TimezoneMiddleware',
    'django.middleware.security.SecurityMiddleware',
@ -67,11 +84,58 @@ MIDDLEWARE = [
    'core.middleware.CacheControlMiddleware',
 ]

+################################################################################
+### Authentication Settings
+################################################################################
+
 AUTHENTICATION_BACKENDS = [
    'django.contrib.auth.backends.RemoteUserBackend',
    'django.contrib.auth.backends.ModelBackend',
 ]

+if LDAP:
+    # When LDAP=True, replace the default auth backends with django_auth_ldap and feed it
+    # the LDAP_* config values. Assignments here are already module-level settings, so no
+    # `global` declarations are needed (they have no effect at module scope).
+    try:
+        import ldap
+        from django_auth_ldap.config import LDAPSearch
+
+        AUTH_LDAP_SERVER_URI = LDAP_SERVER_URI
+        AUTH_LDAP_BIND_DN = LDAP_BIND_DN
+        AUTH_LDAP_BIND_PASSWORD = LDAP_BIND_PASSWORD
+
+        assert AUTH_LDAP_SERVER_URI and LDAP_USERNAME_ATTR and LDAP_USER_FILTER, 'LDAP_* config options must all be set if LDAP=True'
+
+        # search the whole subtree under LDAP_USER_BASE for an entry whose username attribute
+        # equals the login name AND that matches the configured LDAP_USER_FILTER
+        AUTH_LDAP_USER_SEARCH = LDAPSearch(
+            LDAP_USER_BASE,
+            ldap.SCOPE_SUBTREE,
+            '(&(' + LDAP_USERNAME_ATTR + '=%(user)s)' + LDAP_USER_FILTER + ')',
+        )
+
+        # Django user field -> LDAP attribute name
+        AUTH_LDAP_USER_ATTR_MAP = {
+            'username': LDAP_USERNAME_ATTR,
+            'first_name': LDAP_FIRSTNAME_ATTR,
+            'last_name': LDAP_LASTNAME_ATTR,
+            'email': LDAP_EMAIL_ATTR,
+        }
+
+        # LDAP becomes the only backend: RemoteUserBackend and ModelBackend are dropped
+        AUTHENTICATION_BACKENDS = [
+            'django_auth_ldap.backend.LDAPBackend',
+        ]
+    except ModuleNotFoundError:
+        sys.stderr.write('[X] Error: Found LDAP=True config but LDAP packages not installed. You may need to run: pip install archivebox[ldap]\n\n')
+        # don't hard exit here: if the user is just running "archivebox version" or "archivebox help", we still want those to work despite broken ldap
+        # sys.exit(1)
+
+
+################################################################################
+### Debug Settings
+################################################################################
+
 # only enable debug toolbar when in DEBUG mode with --nothreading (it doesnt work in multithreaded mode)
 DEBUG_TOOLBAR = DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
 if DEBUG_TOOLBAR:
@ -267,8 +331,8 @@ class NoisyRequestsFilter(logging.Filter):
 if LOGS_DIR.exists():
    ERROR_LOG = (LOGS_DIR / 'errors.log')
 else:
-    # meh too many edge cases here around creating log dir w/ correct permissions
-    # cant be bothered, just trash the log and let them figure it out via stdout/stderr
+    # historically too many edge cases here around creating log dir w/ correct permissions early on
+    # if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr
    ERROR_LOG = tempfile.NamedTemporaryFile().name

 LOGGING = {
--- a/archivebox/core/urls.py
+++ b/archivebox/core/urls.py
@ -33,6 +33,9 @@ urlpatterns = [
    path('admin/', admin.site.urls),

    path('health/', HealthCheckView.as_view(), name='healthcheck'),
+    path('error/', lambda _: 1/0),
+
+    # path('jet_api/', include('jet_django.urls')),  Enable to use https://www.jetadmin.io/integrations/django

    path('index.html', RedirectView.as_view(url='/')),
    path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}),
--- a/archivebox/extractors/dom.py
+++ b/archivebox/extractors/dom.py
@ -9,6 +9,7 @@ from ..util import (
    enforce_types,
    is_static_file,
    chrome_args,
+    chrome_cleanup,
 )
 from ..config import (
    TIMEOUT,
@ -57,6 +58,7 @@ def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
    except Exception as err:
        status = 'failed'
        output = err
+        chrome_cleanup()
    finally:
        timer.end()

--- a/archivebox/extractors/pdf.py
+++ b/archivebox/extractors/pdf.py
@ -9,6 +9,7 @@ from ..util import (
    enforce_types,
    is_static_file,
    chrome_args,
+    chrome_cleanup,
 )
 from ..config import (
    TIMEOUT,
@ -54,6 +55,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
    except Exception as err:
        status = 'failed'
        output = err
+        chrome_cleanup()
    finally:
        timer.end()

--- a/archivebox/extractors/readability.py
+++ b/archivebox/extractors/readability.py
@ -71,7 +71,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
        result = run(cmd, cwd=out_dir, timeout=timeout)
        try:
            result_json = json.loads(result.stdout)
-            assert result_json and 'content' in result_json
+            assert result_json and 'content' in result_json, 'Readability output is not valid JSON'
        except json.JSONDecodeError:
            raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr)

@ -85,7 +85,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
        #  "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        output_tail = [
            line.strip()
-            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
+            for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
            if line.strip()
        ]
        hints = (
--- a/archivebox/extractors/screenshot.py
+++ b/archivebox/extractors/screenshot.py
@ -9,6 +9,7 @@ from ..util import (
    enforce_types,
    is_static_file,
    chrome_args,
+    chrome_cleanup,
 )
 from ..config import (
    TIMEOUT,
@ -54,6 +55,7 @@ def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
    except Exception as err:
        status = 'failed'
        output = err
+        chrome_cleanup()
    finally:
        timer.end()

--- a/archivebox/extractors/title.py
+++ b/archivebox/extractors/title.py
@ -26,7 +26,7 @@ from ..logging_util import TimedProgress

 HTML_TITLE_REGEX = re.compile(
    r'<title.*?>'                      # start matching text after <title> tag
-    r'(.[^<>]+)',                      # get everything up to these symbols
+    r'([^<>]+)',                      # get everything up to these symbols
    re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
 )

--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@ -177,7 +177,7 @@ def snapshot_icons(snapshot) -> str:
                # The check for archive_org is different, so it has to be handled separately

                # get from db (faster)
-                exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
+                exists = extractor in extractor_outputs and extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
                # get from filesystem (slower)
                # target_path = Path(path) / "archive.org.txt"
                # exists = target_path.exists()
--- a/archivebox/logging_util.py
+++ b/archivebox/logging_util.py
@ -441,7 +441,7 @@ def log_archive_method_finished(result: "ArchiveResult"):

            hints = (
                '    {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
-                for line in hints[:5] if line.strip()
+                for line in list(hints)[:5] if line.strip()
            )


@ -533,11 +533,27 @@ def log_shell_welcome_msg():
 ### Helpers

@enforce_types
-def pretty_path(path: Union[Path, str]) -> str:
+def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=OUTPUT_DIR) -> str:
    """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
-    pwd = Path('.').resolve()
-    # parent = os.path.abspath(os.path.join(pwd, os.path.pardir))
-    return str(path).replace(str(pwd) + '/', './')
+    # path: the path to shorten for display
+    # pwd:  the base directory that gets abbreviated to "." (defaults to OUTPUT_DIR)
+    # returns the display string: shortened if under pwd, wrapped in double quotes if it contains spaces
+    pwd = str(Path(pwd))  # .resolve()
+    path = str(path)
+
+    if not path:
+        return path
+
+    # replace long absolute paths with ./ relative ones to save on terminal output width
+    # (only when pwd is a whole leading path component, so pwd=/data never rewrites /database/x)
+    remainder = path[len(pwd):]
+    if path.startswith(pwd) and (pwd != '/') and remainder[:1] in ('', '/', '\\'):
+        path = '.' + remainder
+
+    # quote paths containing spaces
+    if ' ' in path:
+        path = f'"{path}"'
+
+    # if path is just a plain dot, replace it back with the absolute path for clarity
+    if path == '.':
+        path = pwd
+
+    return path


@enforce_types
@ -578,6 +594,7 @@ def printable_folder_status(name: str, folder: Dict) -> str:
    else:
        color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'

+
    if folder['path']:
        if Path(folder['path']).exists():
            num_files = (
@ -592,13 +609,7 @@ def printable_folder_status(name: str, folder: Dict) -> str:
        # add symbol @ next to filecount if path is a remote filesystem mount
        num_files = f'{num_files} @' if num_files else '@'

-    path = str(folder['path']).replace(str(OUTPUT_DIR), '.') if folder['path'] else ''
-    if path and ' ' in path:
-        path = f'"{path}"'
-
-    # if path is just a plain dot, replace it back with the full path for clarity
-    if path == '.':
-        path = str(OUTPUT_DIR)
+    path = pretty_path(folder['path'])

    return ' '.join((
        ANSI[color],
@ -629,9 +640,7 @@ def printable_dependency_version(name: str, dependency: Dict) -> str:
    else:
        color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'

-    path = str(dependency["path"]).replace(str(OUTPUT_DIR), '.') if dependency["path"] else ''
-    if path and ' ' in path:
-        path = f'"{path}"'
+    path = pretty_path(dependency['path'])

    return ' '.join((
        ANSI[color],
--- a/archivebox/main.py
+++ b/archivebox/main.py
@ -112,6 +112,8 @@ from .config import (
    load_all_config,
    CONFIG,
    USER_CONFIG,
+    ADMIN_USERNAME,
+    ADMIN_PASSWORD,
    get_real_name,
    setup_django,
 )
@ -216,7 +218,7 @@ def version(quiet: bool=False,
    if not quiet:
        # 0.6.3
        # ArchiveBox v0.6.3 Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
-        # DEBUG=False IN_DOCKER=True IS_TTY=True TZ=UTC FS_ATOMIC=True FS_REMOTE=False FS_PERMS=644 501:20 SEARCH_BACKEND=ripgrep
+        # DEBUG=False IN_DOCKER=True IS_TTY=True TZ=UTC FS_ATOMIC=True FS_REMOTE=False FS_PERMS=644 FS_USER=501:20 SEARCH_BACKEND=ripgrep
        
        p = platform.uname()
        print(
@ -236,7 +238,8 @@ def version(quiet: bool=False,
            #f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})',  # add this if we have more useful info to show eventually
            f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
            f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
-            f'FS_PERMS={OUTPUT_PERMISSIONS} {PUID}:{PGID}',
+            f'FS_USER={PUID}:{PGID}',
+            f'FS_PERMS={OUTPUT_PERMISSIONS}',
            f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
        )
        print()
@ -251,19 +254,19 @@ def version(quiet: bool=False,
        
        print()
        print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
-        for name, folder in CODE_LOCATIONS.items():
-            print(printable_folder_status(name, folder))
+        for name, path in CODE_LOCATIONS.items():
+            print(printable_folder_status(name, path))

        print()
        print('{white}[i] Secrets locations:{reset}'.format(**ANSI))
-        for name, folder in EXTERNAL_LOCATIONS.items():
-            print(printable_folder_status(name, folder))
+        for name, path in EXTERNAL_LOCATIONS.items():
+            print(printable_folder_status(name, path))

        print()
        if DATA_LOCATIONS['OUTPUT_DIR']['is_valid']:
            print('{white}[i] Data locations:{reset}'.format(**ANSI))
-            for name, folder in DATA_LOCATIONS.items():
-                print(printable_folder_status(name, folder))
+            for name, path in DATA_LOCATIONS.items():
+                print(printable_folder_status(name, path))
        else:
            print()
            print('{white}[i] Data locations:{reset}'.format(**ANSI))
@ -419,14 +422,16 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=
        write_main_index(list(pending_links.values()), out_dir=out_dir)

    print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
+
+    from django.contrib.auth.models import User
+
+    if (ADMIN_USERNAME and ADMIN_PASSWORD) and not User.objects.filter(username=ADMIN_USERNAME).exists():
+        print('{green}[+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.{reset}'.format(**ANSI))
+        User.objects.create_superuser(username=ADMIN_USERNAME, password=ADMIN_PASSWORD)
+
    if existing_index:
        print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
    else:
-        # TODO: allow creating new supersuer via env vars on first init
-        # if config.HTTP_USER and config.HTTP_PASS:
-        #     from django.contrib.auth.models import User
-        #     User.objects.create_superuser(HTTP_USER, '', HTTP_PASS)
-
        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links) + len(pending_links), **ANSI))

    json_index = out_dir / JSON_INDEX_FILENAME
--- a/archivebox/parsers/init.py
+++ b/archivebox/parsers/init.py
@ -34,6 +34,7 @@ from ..index.schema import Link
 from ..logging_util import TimedProgress, log_source_saved

 from . import pocket_api
+from . import readwise_reader_api
 from . import wallabag_atom
 from . import pocket_html
 from . import pinboard_rss
@ -51,6 +52,7 @@ from . import url_list
 PARSERS = {
    # Specialized parsers
    pocket_api.KEY:     (pocket_api.NAME,       pocket_api.PARSER),
+    readwise_reader_api.KEY: (readwise_reader_api.NAME, readwise_reader_api.PARSER),
    wallabag_atom.KEY:  (wallabag_atom.NAME,    wallabag_atom.PARSER),
    pocket_html.KEY:    (pocket_html.NAME,      pocket_html.PARSER),
    pinboard_rss.KEY:   (pinboard_rss.NAME,     pinboard_rss.PARSER),
@ -233,6 +235,10 @@ _test_url_strs = {
    'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
    'https://example.com?what=1#how-about-this=1&2%20baf': 1,
    '<test>http://example7.com</test>': 1,
+    'https://<test>': 0,
+    'https://[test]': 0,
+    'http://"test"': 0,
+    'http://\'test\'': 0,
    '[https://example8.com/what/is/this.php?what=1]': 1,
    '[and http://example9.com?what=1&other=3#and-thing=2]': 1,
    '<what>https://example10.com#and-thing=2 "</about>': 1,
--- a/archivebox/parsers/generic_json.py
+++ b/archivebox/parsers/generic_json.py
@ -17,7 +17,10 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""

    json_file.seek(0)
-    links = json.load(json_file)
+
+    # sometimes the first line is a comment or filepath, so we get everything after the first {
+    json_file_json_str = '{' + json_file.read().split('{', 1)[-1]
+    links = json.loads(json_file_json_str)
    json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')

    for link in links:
--- a/archivebox/parsers/readwise_reader_api.py
+++ b/archivebox/parsers/readwise_reader_api.py
@ -0,0 +1,123 @@
+__package__ = "archivebox.parsers"
+
+
+import re
+import requests
+from datetime import datetime
+
+from typing import IO, Iterable, Optional
+from configparser import ConfigParser
+
+from pathlib import Path
+
+from ..index.schema import Link
+from ..util import enforce_types
+from ..system import atomic_write
+from ..config import (
+    SOURCES_DIR,
+    READWISE_READER_TOKENS,
+)
+
+
+# ini-style file (read and written with ConfigParser below) holding the saved cursor per username
+API_DB_PATH = Path(SOURCES_DIR) / "readwise_reader_api.db"
+
+
+class ReadwiseReaderAPI:
+    """Minimal client for the Readwise Reader v3 list endpoint.
+
+    Holds the API token to authenticate with and the page cursor that the
+    next `get_archive()` call will request.
+    """
+
+    cursor: Optional[str]
+
+    def __init__(self, api_token, cursor=None) -> None:
+        self.api_token = api_token
+        self.cursor = cursor
+
+    def get_archive(self):
+        """Request one page of documents with location=archive, starting at `self.cursor`.
+
+        Returns the `requests` response object; `raise_for_status()` raises on HTTP error codes.
+        """
+        response = requests.get(
+            url="https://readwise.io/api/v3/list/",
+            # authenticate as the user this client was constructed for (never a hard-coded token)
+            headers={"Authorization": f"Token {self.api_token}"},
+            params={
+                "location": "archive",
+                "pageCursor": self.cursor,
+            }
+        )
+        response.raise_for_status()
+        return response
+
+def get_readwise_reader_articles(api: ReadwiseReaderAPI):
+    """Yield every article in the archive, following pagination cursors.
+
+    Starts at `api.cursor` and sets `api.cursor` to each `nextPageCursor` returned;
+    once exhausted, `api.cursor` holds the last cursor that was followed (if any).
+    """
+    # iterate rather than recurse, so the number of pages is not bounded by the recursion limit
+    while True:
+        body = api.get_archive().json()
+        yield from body["results"]
+
+        # a missing or empty cursor means this was the final page
+        next_cursor = body.get("nextPageCursor")
+        if not next_cursor:
+            break
+        api.cursor = next_cursor
+
+
+def link_from_article(article: dict, sources: list):
+    """Build a Link from one article record returned by the Reader API."""
+    source_url: str = article['source_url']
+    # fall back to the URL when the title is empty or missing a value
+    display_title = article["title"] or source_url
+    # `updated_at` is parsed as an ISO-format datetime and stored as epoch seconds
+    updated_at = datetime.fromisoformat(article['updated_at'])
+    epoch_seconds = updated_at.timestamp()
+
+    return Link(
+        url=source_url,
+        timestamp=str(epoch_seconds),
+        title=display_title,
+        tags="",
+        sources=sources,
+    )
+
+
+def write_cursor(username: str, since: str):
+    """Store `since` as the saved cursor for `username` in the API_DB_PATH file."""
+    # make sure the db file exists before loading it
+    if not API_DB_PATH.exists():
+        atomic_write(API_DB_PATH, "")
+
+    cursor_db = ConfigParser()
+    cursor_db.optionxform = str  # keep option names exactly as written
+    cursor_db.read(API_DB_PATH)
+
+    # replaces any section previously stored for this username
+    cursor_db[username] = {"since": since}
+
+    with API_DB_PATH.open("w+") as db_file:
+        cursor_db.write(db_file)
+
+
+def read_cursor(username: str) -> Optional[str]:
+    """Return the cursor saved for `username`, or None when nothing is stored.
+
+    Creates an empty API_DB_PATH file first if it does not exist yet.
+    """
+    if not API_DB_PATH.exists():
+        atomic_write(API_DB_PATH, "")
+
+    cursor_db = ConfigParser()
+    cursor_db.optionxform = str  # keep option names exactly as written
+    cursor_db.read(API_DB_PATH)
+
+    saved_cursor = cursor_db.get(username, "since", fallback=None)
+    return saved_cursor
+
+
+
+
+@enforce_types
+def should_parse_as_readwise_reader_api(text: str) -> bool:
+    """Return True when `text` begins with the readwise-reader:// scheme."""
+    scheme_prefix = "readwise-reader://"
+    return text.startswith(scheme_prefix)
+
+
+@enforce_types
+def parse_readwise_reader_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
+    """Parse bookmarks from the Readwise Reader API
+
+    Every input line of the form readwise-reader://<username> triggers a fetch using the
+    token stored under <username> in READWISE_READER_TOKENS, starting from the cursor
+    returned by read_cursor(username). Other lines are ignored.
+
+    Raises KeyError if <username> has no entry in READWISE_READER_TOKENS.
+    """
+
+    input_buffer.seek(0)
+    pattern = re.compile(r"^readwise-reader:\/\/(\w+)")
+    for line in input_buffer:
+        if not should_parse_as_readwise_reader_api(line):
+            continue
+
+        match = pattern.search(line)
+        if match is None:
+            # scheme prefix present but no username after it: nothing to look up
+            continue
+
+        username = match.group(1)
+        api = ReadwiseReaderAPI(READWISE_READER_TOKENS[username], cursor=read_cursor(username))
+
+        for article in get_readwise_reader_articles(api):
+            yield link_from_article(article, sources=[line])
+
+        # persist the cursor the client ended on, for the next import of this user
+        if api.cursor:
+            write_cursor(username, api.cursor)
+
+
+# Identifier, display name and entry point under which this parser is listed in PARSERS
+KEY = "readwise_reader_api"
+NAME = "Readwise Reader API"
+PARSER = parse_readwise_reader_api_export
--- a/archivebox/templates/admin/private_index.html
+++ b/archivebox/templates/admin/private_index.html
@ -1,62 +1,3 @@
-{% extends "base.html" %}
-{% load static %}
-
-{% block body %}
-    <div id="toolbar">
-        <form id="changelist-search" action="{% url 'public-index' %}" method="get">
-            <div>
-                <label for="searchbar"><img src="/static/admin/img/search.svg" alt="Search"></label>
-                <input type="text" size="40" name="q" value="" id="searchbar" autofocus placeholder="Title, URL, tags, timestamp, or content...".>
-                <input type="submit" value="Search" style="height: 36px; padding-top: 6px; margin: 8px"/>
-                <input type="button"
-                       value="♺"
-                       title="Refresh..."
-                       onclick="location.href='{% url 'public-index' %}'"
-                       style="background-color: rgba(121, 174, 200, 0.8); height: 30px; font-size: 0.8em; margin-top: 12px; padding-top: 6px; float:right">
-                </input>
-            </div>
-        </form>
-    </div>
-    <table id="table-bookmarks">
-        <thead>
-            <tr>
-                <th style="width: 100px;">Bookmarked</th>
-                <th style="width: 26vw;">Snapshot ({{object_list|length}})</th>
-                <th style="width: 140px">Files</th>
-                <th style="width: 16vw;whitespace:nowrap;overflow-x:hidden;">Original URL</th>
-            </tr>
-        </thead>
-            <tbody>
-                {% for link in object_list %}
-                    {% include 'main_index_row.html' with link=link  %}
-                {% endfor %}
-            </tbody>
-        </table>
-        <center>
-            <span class="step-links">
-                {% if page_obj.has_previous %}
-                    <a href="{% url 'public-index' %}?page=1">&laquo; first</a>
-                    <a href="{% url 'public-index' %}?page={{ page_obj.previous_page_number }}">previous</a>
-                {% endif %}
-        
-                <span class="current">
-                    Page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }}.
-                </span>
-        
-                {% if page_obj.has_next %}
-                    <a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a>
-                    <a href="{% url 'public-index' %}?page={{ page_obj.paginator.num_pages }}">last &raquo;</a>
-                {% endif %}
-            </span>
-    
-            {% if page_obj.has_next %}
-                <a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a>
-                <a href="{% url 'public-index' %}?page={{ page_obj.paginator.num_pages }}">last &raquo;</a>
-            {% endif %}
-        </span>
-        <br>
-    </center>
-{% endblock %}
 {% extends "admin/base_site.html" %}
 {% load i18n admin_urls static admin_list %}
 {% load core_tags %}
--- a/archivebox/templates/core/add.html
+++ b/archivebox/templates/core/add.html
@ -33,7 +33,7 @@
                    <br/>
                    <div class="loader"></div>
                    <br/>
-                    Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for progress...
+                    Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for detailed progress...
                </center>
            </div>
            <form id="add-form" method="POST" class="p-form">{% csrf_token %}
@ -46,19 +46,22 @@
            </form>
            <br/><br/><br/>
            <center id="delay-warning" style="display: none">
-                <small>(it's safe to leave this page, adding will continue in the background)</small>
+                <small>(you will be redirected to your <a href="/">Snapshot list</a> momentarily, it's safe to close this page at any time)</small>
            </center>
            {% if absolute_add_path %}
-            <center id="bookmarklet">
+            <!-- <center id="bookmarklet">
              <p>Bookmark this link to quickly add to your archive:
                <a href="javascript:void(window.open('{{ absolute_add_path }}?url='+encodeURIComponent(document.location.href)));">Add to ArchiveBox</a></p>
-            </center>
+            </center> -->
            {% endif %}
            <script>
                document.getElementById('add-form').addEventListener('submit', function(event) {
                    document.getElementById('in-progress').style.display = 'block'
                    document.getElementById('add-form').style.display = 'none'
                    document.getElementById('delay-warning').style.display = 'block'
+                    setTimeout(function() {
+                        window.location = '/'
+                    }, 2000)
                    return true
                })
            </script>
--- a/archivebox/util.py
+++ b/archivebox/util.py
@ -17,6 +17,8 @@ from requests.exceptions import RequestException, ReadTimeout

 from .vendor.base32_crockford import encode as base32_encode                            # type: ignore
 from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
+from os.path import lexists
+from os import remove as remove_file

 try:
    import chardet
@ -59,7 +61,7 @@ URL_REGEX = re.compile(
    r'(?=('
    r'http[s]?://'                    # start matching from allowed schemes
    r'(?:[a-zA-Z]|[0-9]'              # followed by allowed alphanum characters
-    r'|[$-_@.&+]|[!*\(\),]'           #    or allowed symbols
+    r'|[-_$@.&+!*\(\),]'           #    or allowed symbols (keep hyphen first to match literal hyphen)
    r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'  #    or allowed unicode bytes
    r'[^\]\[\(\)<>"\'\s]+'          # stop parsing at these symbols
    r'))',
@ -272,6 +274,16 @@ def chrome_args(**options) -> List[str]:
    
    return cmd_args

+def chrome_cleanup():
+    """
+    Cleans up any state or runtime files that chrome leaves behind when killed by
+    a timeout or other error
+
+    Only acts when IN_DOCKER is set; a lock file that is already gone is not an error.
+    """
+
+    from .config import IN_DOCKER
+
+    if not IN_DOCKER:
+        return
+
+    # Remove directly instead of checking lexists() first: the lock can disappear between
+    # the check and the removal, and this is called from the extractors' `except` handlers,
+    # where a fresh exception would escape instead of being recorded as a failed result.
+    try:
+        remove_file("/home/archivebox/.config/chromium/SingletonLock")
+    except FileNotFoundError:
+        pass

 def ansi_to_html(text):
    """