Merge branch 'dev' into issue1316

Commit c6d644be29 by Nick Sweeting, 2024-04-24 16:24:16 -07:00 (committed via GitHub)
74 changed files with 5518 additions and 1370 deletions


archivebox/api/apps.py (new file, +5 lines)

@ -0,0 +1,5 @@
from django.apps import AppConfig
class APIConfig(AppConfig):
name = 'api'

archivebox/api/archive.py (new file, +184 lines)

@ -0,0 +1,184 @@
# archivebox_api.py
from typing import List, Optional
from enum import Enum
from pydantic import BaseModel
from ninja import Router
from main import (
add,
remove,
update,
list_all,
ONLY_NEW,
) # Assuming these functions are defined in main.py
# Schemas
class StatusChoices(str, Enum):
indexed = 'indexed'
archived = 'archived'
unarchived = 'unarchived'
present = 'present'
valid = 'valid'
invalid = 'invalid'
duplicate = 'duplicate'
orphaned = 'orphaned'
corrupted = 'corrupted'
unrecognized = 'unrecognized'
class AddURLSchema(BaseModel):
urls: List[str]
tag: str = ""
depth: int = 0
update: bool = not ONLY_NEW # Default to the opposite of ONLY_NEW
update_all: bool = False
index_only: bool = False
overwrite: bool = False
init: bool = False
extractors: str = ""
parser: str = "auto"
class RemoveURLSchema(BaseModel):
yes: bool = False
delete: bool = False
before: Optional[float] = None
after: Optional[float] = None
filter_type: str = "exact"
filter_patterns: Optional[List[str]] = None
class UpdateSchema(BaseModel):
resume: Optional[float] = None
only_new: Optional[bool] = None
index_only: Optional[bool] = False
overwrite: Optional[bool] = False
before: Optional[float] = None
after: Optional[float] = None
status: Optional[StatusChoices] = None
filter_type: Optional[str] = 'exact'
filter_patterns: Optional[List[str]] = None
extractors: Optional[str] = ""
class ListAllSchema(BaseModel):
filter_patterns: Optional[List[str]] = None
filter_type: str = 'exact'
status: Optional[StatusChoices] = None
after: Optional[float] = None
before: Optional[float] = None
sort: Optional[str] = None
csv: Optional[str] = None
json: bool = False
html: bool = False
with_headers: bool = False
# API Router
router = Router()
@router.post("/add", response={200: dict})
def api_add(request, payload: AddURLSchema):
try:
result = add(
urls=payload.urls,
tag=payload.tag,
depth=payload.depth,
update=payload.update,
update_all=payload.update_all,
index_only=payload.index_only,
overwrite=payload.overwrite,
init=payload.init,
extractors=payload.extractors,
parser=payload.parser,
)
# Currently the add function returns a list of ALL items in the DB, ideally only return new items
return {
"status": "success",
"message": "URLs added successfully.",
"result": str(result),
}
except Exception as e:
# Handle exceptions raised by the add function or during processing
return {"status": "error", "message": str(e)}
@router.post("/remove", response={200: dict})
def api_remove(request, payload: RemoveURLSchema):
try:
result = remove(
yes=payload.yes,
delete=payload.delete,
before=payload.before,
after=payload.after,
filter_type=payload.filter_type,
filter_patterns=payload.filter_patterns,
)
return {
"status": "success",
"message": "URLs removed successfully.",
"result": result,
}
except Exception as e:
# Handle exceptions raised by the remove function or during processing
return {"status": "error", "message": str(e)}
@router.post("/update", response={200: dict})
def api_update(request, payload: UpdateSchema):
try:
result = update(
resume=payload.resume,
only_new=payload.only_new,
index_only=payload.index_only,
overwrite=payload.overwrite,
before=payload.before,
after=payload.after,
status=payload.status,
filter_type=payload.filter_type,
filter_patterns=payload.filter_patterns,
extractors=payload.extractors,
)
return {
"status": "success",
"message": "Archive updated successfully.",
"result": result,
}
except Exception as e:
# Handle exceptions raised by the update function or during processing
return {"status": "error", "message": str(e)}
@router.post("/list_all", response={200: dict})
def api_list_all(request, payload: ListAllSchema):
try:
result = list_all(
filter_patterns=payload.filter_patterns,
filter_type=payload.filter_type,
status=payload.status,
after=payload.after,
before=payload.before,
sort=payload.sort,
csv=payload.csv,
json=payload.json,
html=payload.html,
with_headers=payload.with_headers,
)
# TODO: This is kind of bad, make the format a choice field
if payload.json:
return {"status": "success", "format": "json", "data": result}
elif payload.html:
return {"status": "success", "format": "html", "data": result}
elif payload.csv:
return {"status": "success", "format": "csv", "data": result}
else:
return {
"status": "success",
"message": "List generated successfully.",
"data": result,
}
except Exception as e:
# Handle exceptions raised by the list_all function or during processing
return {"status": "error", "message": str(e)}

archivebox/api/auth.py (new file, +48 lines)

@ -0,0 +1,48 @@
from django.contrib.auth import authenticate
from ninja import Form, Router, Schema
from ninja.security import HttpBearer
from api.models import Token
router = Router()
class GlobalAuth(HttpBearer):
def authenticate(self, request, token):
try:
return Token.objects.get(token=token).user
except Token.DoesNotExist:
pass
class AuthSchema(Schema):
email: str
password: str
@router.post("/authenticate", auth=None) # overriding global auth
def get_token(request, auth_data: AuthSchema):
user = authenticate(username=auth_data.email, password=auth_data.password)
if user:
# Assuming a user can have multiple tokens and you want to create a new one every time
new_token = Token.objects.create(user=user)
return {"token": new_token.token, "expires": new_token.expiry_as_iso8601}
else:
return {"error": "Invalid credentials"}
class TokenValidationSchema(Schema):
token: str
@router.post("/validate_token", auth=None) # No authentication required for this endpoint
def validate_token(request, token_data: TokenValidationSchema):
try:
# Attempt to authenticate using the provided token
user = GlobalAuth().authenticate(request, token_data.token)
if user:
return {"status": "valid"}
else:
return {"status": "invalid"}
except Token.DoesNotExist:
return {"status": "invalid"}


@ -0,0 +1,28 @@
# Generated by Django 3.1.14 on 2024-04-09 18:52
import api.models
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
initial = True
dependencies = [
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.CreateModel(
name='Token',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('token', models.CharField(default=api.models.hex_uuid, max_length=32, unique=True)),
('created', models.DateTimeField(auto_now_add=True)),
('expiry', models.DateTimeField(blank=True, null=True)),
('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='tokens', to=settings.AUTH_USER_MODEL)),
],
),
]


archivebox/api/models.py (new file, +30 lines)

@ -0,0 +1,30 @@
import uuid
from datetime import timedelta
from django.conf import settings
from django.db import models
from django.utils import timezone
from django.utils.translation import gettext_lazy as _
def hex_uuid():
return uuid.uuid4().hex
class Token(models.Model):
user = models.ForeignKey(
settings.AUTH_USER_MODEL, on_delete=models.CASCADE, related_name="tokens"
)
token = models.CharField(max_length=32, default=hex_uuid, unique=True)
created = models.DateTimeField(auto_now_add=True)
expiry = models.DateTimeField(null=True, blank=True)
@property
def expiry_as_iso8601(self):
"""Returns the expiry date of the token in ISO 8601 format or a date 100 years in the future if none."""
expiry_date = (
self.expiry if self.expiry else timezone.now() + timedelta(days=365 * 100)
)
return expiry_date.isoformat()
def __str__(self):
return self.token
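
A small sketch of how the Token model behaves in a Django shell (e.g. archivebox shell); the user lookup here is purely illustrative:

from django.contrib.auth import get_user_model
from api.models import Token

user = get_user_model().objects.first()  # any existing user, for illustration only
token = Token.objects.create(user=user)  # token string defaults to hex_uuid()
print(token.token)                       # 32-character hex string
print(token.expiry_as_iso8601)           # ~100 years in the future when expiry is unset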

archivebox/api/tests.py (new file, +27 lines)

@ -0,0 +1,27 @@
from django.test import TestCase
from ninja.testing import TestClient
from archivebox.api.archive import router as archive_router
class ArchiveBoxAPITestCase(TestCase):
def setUp(self):
self.client = TestClient(archive_router)
def test_add_endpoint(self):
response = self.client.post("/add", json={"urls": ["http://example.com"], "tag": "test"})
self.assertEqual(response.status_code, 200)
self.assertEqual(response.json()["status"], "success")
def test_remove_endpoint(self):
response = self.client.post("/remove", json={"filter_patterns": ["http://example.com"]})
self.assertEqual(response.status_code, 200)
self.assertEqual(response.json()["status"], "success")
def test_update_endpoint(self):
response = self.client.post("/update", json={})
self.assertEqual(response.status_code, 200)
self.assertEqual(response.json()["status"], "success")
def test_list_all_endpoint(self):
response = self.client.post("/list_all", json={})
self.assertEqual(response.status_code, 200)
self.assertTrue("success" in response.json()["status"])
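
A similar hedged sketch of a test against the auth router could sit alongside these, reusing the same TestCase and TestClient imports from the top of this file (the assertion relies on get_token returning a 200 response with an "error" body for bad credentials):

from archivebox.api.auth import router as auth_router

class AuthAPITestCase(TestCase):
    def setUp(self):
        self.client = TestClient(auth_router)

    def test_authenticate_rejects_bad_credentials(self):
        response = self.client.post("/authenticate", json={"email": "nobody", "password": "wrong"})
        self.assertEqual(response.status_code, 200)
        self.assertIn("error", response.json())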


@ -112,6 +112,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'LDAP_FIRSTNAME_ATTR': {'type': str, 'default': None},
'LDAP_LASTNAME_ATTR': {'type': str, 'default': None},
'LDAP_EMAIL_ATTR': {'type': str, 'default': None},
'LDAP_CREATE_SUPERUSER': {'type': bool, 'default': False},
},
'ARCHIVE_METHOD_TOGGLES': {
@ -136,14 +137,15 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
},
'ARCHIVE_METHOD_OPTIONS': {
'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)},
'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com'},
'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION','WINDOW_SIZE')},
'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
'USER_AGENT': {'type': str, 'default': None},
'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
'CHROME_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
'COOKIES_FILE': {'type': str, 'default': None},
'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
@ -151,7 +153,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'CHROME_TIMEOUT': {'type': int, 'default': 0},
'CHROME_HEADLESS': {'type': bool, 'default': True},
'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
'CHROME_EXTRA_ARGS': {'type': list, 'default': None},
'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
'--restrict-filenames',
'--trim-filenames', '128',
'--write-description',
'--write-info-json',
'--write-annotations',
@ -173,6 +179,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'--add-metadata',
'--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
]},
'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None},
'WGET_ARGS': {'type': list, 'default': ['--no-verbose',
@ -184,12 +191,17 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'--no-parent',
'-e', 'robots=off',
]},
'WGET_EXTRA_ARGS': {'type': list, 'default': None},
'CURL_ARGS': {'type': list, 'default': ['--silent',
'--location',
'--compressed'
]},
'CURL_EXTRA_ARGS': {'type': list, 'default': None},
'GIT_ARGS': {'type': list, 'default': ['--recursive']},
'SINGLEFILE_ARGS': {'type': list, 'default' : None},
'SINGLEFILE_ARGS': {'type': list, 'default': None},
'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
'MERCURY_ARGS': {'type': list, 'default': ['--format=text']},
'MERCURY_EXTRA_ARGS': {'type': list, 'default': None},
'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'},
},
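
The new USER_AGENT option above acts as a single override that the three per-tool user agents fall back on; a tiny illustrative sketch of how those lambda defaults resolve (the config dicts here are made up):

# illustrative only: mirrors the `lambda c: c['USER_AGENT'] or <stock UA>` defaults above
stock_curl_ua = 'Mozilla/5.0 (...) ArchiveBox/{VERSION} (...) curl/{CURL_VERSION}'
curl_user_agent = lambda c: c['USER_AGENT'] or stock_curl_ua

print(curl_user_agent({'USER_AGENT': None}))         # -> the stock Chrome-like UA string
print(curl_user_agent({'USER_AGENT': 'MyBot/1.0'}))  # -> 'MyBot/1.0'; one setting covers the curl, wget, and chrome UAs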
@ -269,6 +281,7 @@ TEMPLATES_DIR_NAME = 'templates'
ARCHIVE_DIR_NAME = 'archive'
SOURCES_DIR_NAME = 'sources'
LOGS_DIR_NAME = 'logs'
PERSONAS_DIR_NAME = 'personas'
SQL_INDEX_FILENAME = 'index.sqlite3'
JSON_INDEX_FILENAME = 'index.json'
HTML_INDEX_FILENAME = 'index.html'
@ -342,9 +355,11 @@ ALLOWED_IN_OUTPUT_DIR = {
'static',
'sonic',
'search.sqlite3',
'crontabs',
ARCHIVE_DIR_NAME,
SOURCES_DIR_NAME,
LOGS_DIR_NAME,
PERSONAS_DIR_NAME,
SQL_INDEX_FILENAME,
f'{SQL_INDEX_FILENAME}-wal',
f'{SQL_INDEX_FILENAME}-shm',
@ -363,24 +378,32 @@ ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
############################## Version Config ##################################
def get_system_user():
SYSTEM_USER = getpass.getuser() or os.getlogin()
def get_system_user() -> str:
# some host OS's are unable to provide a username (k3s, Windows), making this complicated
# uid 999 is especially problematic and breaks many attempts
SYSTEM_USER = None
FALLBACK_USER_PLACHOLDER = f'user_{os.getuid()}'
# Option 1
try:
import pwd
return pwd.getpwuid(os.geteuid()).pw_name or SYSTEM_USER
except KeyError:
# Process' UID might not map to a user in cases such as running the Docker image
# (where `archivebox` is 999) as a different UID.
pass
except ModuleNotFoundError:
# pwd doesn't exist on windows
pass
except Exception:
# this should never happen, uncomment to debug
# raise
SYSTEM_USER = SYSTEM_USER or pwd.getpwuid(os.geteuid()).pw_name
except (ModuleNotFoundError, Exception):
pass
return SYSTEM_USER
# Option 2
try:
SYSTEM_USER = SYSTEM_USER or getpass.getuser()
except Exception:
pass
# Option 3
try:
SYSTEM_USER = SYSTEM_USER or os.getlogin()
except Exception:
pass
return SYSTEM_USER or FALLBACK_USER_PLACHOLDER
def get_version(config):
try:
@ -487,9 +510,10 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)}, # None means unset, so we autodetect it with find_chrome_Data_dir(), but emptystring '' means user manually set it to '', and we should store it as None
'CHROME_USER_DATA_DIR': {'default': lambda c: Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None},
'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')}, # exec is always needed to list directories
@ -519,6 +543,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
@ -529,18 +554,22 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
'SINGLEFILE_EXTRA_ARGS': {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []},
'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []},
'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
@ -550,6 +579,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},
'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] or find_chrome_binary()},
'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
@ -571,6 +601,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []},
'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
}
@ -899,27 +930,36 @@ def find_chrome_binary() -> Optional[str]:
def find_chrome_data_dir() -> Optional[str]:
"""find any installed chrome user data directories in the default locations"""
# Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
# make sure data dir finding precedence order always matches binary finding order
default_profile_paths = (
'~/.config/chromium',
'~/Library/Application Support/Chromium',
'~/AppData/Local/Chromium/User Data',
'~/.config/chrome',
'~/.config/google-chrome',
'~/Library/Application Support/Google/Chrome',
'~/AppData/Local/Google/Chrome/User Data',
'~/.config/google-chrome-stable',
'~/.config/google-chrome-beta',
'~/Library/Application Support/Google/Chrome Canary',
'~/AppData/Local/Google/Chrome SxS/User Data',
'~/.config/google-chrome-unstable',
'~/.config/google-chrome-dev',
)
for path in default_profile_paths:
full_path = Path(path).resolve()
if full_path.exists():
return full_path
# deprecated because this is DANGEROUS, do not re-implement/uncomment this behavior.
# Going forward we want to discourage people from using their main chrome profile for archiving.
# Session tokens, personal data, and cookies are often returned in server responses,
# when they get archived, they are essentially burned as anyone who can view the archive
# can use that data to masquerade as the logged-in user that did the archiving.
# For this reason users should always create dedicated burner profiles for archiving and not use
# their daily driver main accounts.
# # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
# # make sure data dir finding precedence order always matches binary finding order
# default_profile_paths = (
# '~/.config/chromium',
# '~/Library/Application Support/Chromium',
# '~/AppData/Local/Chromium/User Data',
# '~/.config/chrome',
# '~/.config/google-chrome',
# '~/Library/Application Support/Google/Chrome',
# '~/AppData/Local/Google/Chrome/User Data',
# '~/.config/google-chrome-stable',
# '~/.config/google-chrome-beta',
# '~/Library/Application Support/Google/Chrome Canary',
# '~/AppData/Local/Google/Chrome SxS/User Data',
# '~/.config/google-chrome-unstable',
# '~/.config/google-chrome-dev',
# )
# for path in default_profile_paths:
# full_path = Path(path).resolve()
# if full_path.exists():
# return full_path
return None
def wget_supports_compression(config):
@ -990,6 +1030,11 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
'enabled': True,
'is_valid': config['LOGS_DIR'].exists(),
},
'PERSONAS_DIR': {
'path': config['PERSONAS_DIR'].resolve(),
'enabled': True,
'is_valid': config['PERSONAS_DIR'].exists(),
},
'ARCHIVE_DIR': {
'path': config['ARCHIVE_DIR'].resolve(),
'enabled': True,
@ -1337,6 +1382,8 @@ def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CO
(Path(output_dir) / SOURCES_DIR_NAME).mkdir(exist_ok=True)
(Path(output_dir) / LOGS_DIR_NAME).mkdir(exist_ok=True)
(Path(output_dir) / PERSONAS_DIR_NAME).mkdir(exist_ok=True)
(Path(output_dir) / PERSONAS_DIR_NAME / 'Default').mkdir(exist_ok=True)


@ -1 +1,2 @@
__package__ = 'archivebox.core'


@ -24,8 +24,16 @@ from core.mixins import SearchResultsAdminMixin
from index.html import snapshot_icons
from logging_util import printable_filesize
from main import add, remove
from config import OUTPUT_DIR, SNAPSHOTS_PER_PAGE
from extractors import archive_links
from config import (
OUTPUT_DIR,
SNAPSHOTS_PER_PAGE,
VERSION,
VERSIONS_AVAILABLE,
CAN_UPGRADE
)
GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}
# Admin URLs
# /admin/
@ -40,6 +48,60 @@ from extractors import archive_links
# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel
class ArchiveBoxAdmin(admin.AdminSite):
site_header = 'ArchiveBox'
index_title = 'Links'
site_title = 'Index'
namespace = 'admin'
def get_urls(self):
return [
path('core/snapshot/add/', self.add_view, name='Add'),
] + super().get_urls()
def add_view(self, request):
if not request.user.is_authenticated:
return redirect(f'/admin/login/?next={request.path}')
request.current_app = self.name
context = {
**self.each_context(request),
'title': 'Add URLs',
}
if request.method == 'GET':
context['form'] = AddLinkForm()
elif request.method == 'POST':
form = AddLinkForm(request.POST)
if form.is_valid():
url = form.cleaned_data["url"]
print(f'[+] Adding URL: {url}')
depth = 0 if form.cleaned_data["depth"] == "0" else 1
input_kwargs = {
"urls": url,
"depth": depth,
"update_all": False,
"out_dir": OUTPUT_DIR,
}
add_stdout = StringIO()
with redirect_stdout(add_stdout):
add(**input_kwargs)
print(add_stdout.getvalue())
context.update({
"stdout": ansi_to_html(add_stdout.getvalue().strip()),
"form": AddLinkForm()
})
else:
context["form"] = form
return render(template_name='add.html', request=request, context=context)
archivebox_admin = ArchiveBoxAdmin()
archivebox_admin.register(get_user_model())
archivebox_admin.disable_action('delete_selected')
class ArchiveResultInline(admin.TabularInline):
model = ArchiveResult
@ -49,11 +111,11 @@ class TagInline(admin.TabularInline):
from django.contrib.admin.helpers import ActionForm
from django.contrib.admin.widgets import AutocompleteSelectMultiple
# WIP: broken by Django 3.1.2 -> 4.0 migration
class AutocompleteTags:
model = Tag
search_fields = ['name']
name = 'tags'
remote_field = TagInline
class AutocompleteTagsAdminStub:
name = 'admin'
@ -63,7 +125,6 @@ class SnapshotActionForm(ActionForm):
tags = forms.ModelMultipleChoiceField(
queryset=Tag.objects.all(),
required=False,
# WIP: broken by Django 3.1.2 -> 4.0 migration
widget=AutocompleteSelectMultiple(
AutocompleteTags(),
AutocompleteTagsAdminStub(),
@ -82,6 +143,7 @@ class SnapshotActionForm(ActionForm):
# )
@admin.register(Snapshot, site=archivebox_admin)
class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
list_display = ('added', 'title_str', 'files', 'size', 'url_str')
sort_fields = ('title_str', 'url_str', 'added', 'files')
@ -97,6 +159,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
action_form = SnapshotActionForm
def changelist_view(self, request, extra_context=None):
extra_context = extra_context or {}
return super().changelist_view(request, extra_context | GLOBAL_CONTEXT)
def get_urls(self):
urls = super().get_urls()
custom_urls = [
@ -164,6 +230,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
obj.id,
)
@admin.display(
description='Title',
ordering='title',
)
def title_str(self, obj):
canon = obj.as_link().canonical_outputs()
tags = ''.join(
@ -185,12 +255,17 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
) + mark_safe(f' <span class="tags">{tags}</span>')
@admin.display(
description='Files Saved',
ordering='archiveresult_count',
)
def files(self, obj):
return snapshot_icons(obj)
files.admin_order_field = 'archiveresult_count'
files.short_description = 'Files Saved'
@admin.display(
ordering='archiveresult_count'
)
def size(self, obj):
archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size
if archive_size:
@ -205,8 +280,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
size_txt,
)
size.admin_order_field = 'archiveresult_count'
@admin.display(
description='Original URL',
ordering='url',
)
def url_str(self, obj):
return format_html(
'<a href="{}"><code style="user-select: all;">{}</code></a>',
@ -243,65 +321,76 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
# print('[*] Got request', request.method, request.POST)
# return super().changelist_view(request, extra_context=None)
@admin.action(
description="Pull"
)
def update_snapshots(self, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], out_dir=OUTPUT_DIR)
update_snapshots.short_description = "Pull"
@admin.action(
description="⬇️ Title"
)
def update_titles(self, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR)
update_titles.short_description = "⬇️ Title"
@admin.action(
description="Re-Snapshot"
)
def resnapshot_snapshot(self, request, queryset):
for snapshot in queryset:
timestamp = datetime.now(timezone.utc).isoformat('T', 'seconds')
new_url = snapshot.url.split('#')[0] + f'#{timestamp}'
add(new_url, tag=snapshot.tags_str())
resnapshot_snapshot.short_description = "Re-Snapshot"
@admin.action(
description="Reset"
)
def overwrite_snapshots(self, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], overwrite=True, out_dir=OUTPUT_DIR)
overwrite_snapshots.short_description = "Reset"
@admin.action(
description="Delete"
)
def delete_snapshots(self, request, queryset):
remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR)
delete_snapshots.short_description = "Delete"
@admin.action(
description="+"
)
def add_tags(self, request, queryset):
tags = request.POST.getlist('tags')
print('[+] Adding tags', tags, 'to Snapshots', queryset)
for obj in queryset:
obj.tags.add(*tags)
add_tags.short_description = "+"
@admin.action(
description=""
)
def remove_tags(self, request, queryset):
tags = request.POST.getlist('tags')
print('[-] Removing tags', tags, 'from Snapshots', queryset)
for obj in queryset:
obj.tags.remove(*tags)
remove_tags.short_description = ""
title_str.short_description = 'Title'
url_str.short_description = 'Original URL'
title_str.admin_order_field = 'title'
url_str.admin_order_field = 'url'
@admin.register(Tag, site=archivebox_admin)
class TagAdmin(admin.ModelAdmin):
list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'id')
sort_fields = ('id', 'name', 'slug')
@ -332,6 +421,7 @@ class TagAdmin(admin.ModelAdmin):
) + (f'<br/><a href="/admin/core/snapshot/?tags__id__exact={obj.id}">and {total_count-10} more...<a>' if obj.snapshot_set.count() > 10 else ''))
@admin.register(ArchiveResult, site=archivebox_admin)
class ArchiveResultAdmin(admin.ModelAdmin):
list_display = ('id', 'start_ts', 'extractor', 'snapshot_str', 'tags_str', 'cmd_str', 'status', 'output_str')
sort_fields = ('start_ts', 'extractor', 'status')
@ -344,6 +434,9 @@ class ArchiveResultAdmin(admin.ModelAdmin):
ordering = ['-start_ts']
list_per_page = SNAPSHOTS_PER_PAGE
@admin.display(
description='snapshot'
)
def snapshot_str(self, obj):
return format_html(
'<a href="/archive/{}/index.html"><b><code>[{}]</code></b></a><br/>'
@ -353,6 +446,9 @@ class ArchiveResultAdmin(admin.ModelAdmin):
obj.snapshot.url[:128],
)
@admin.display(
description='tags'
)
def tags_str(self, obj):
return obj.snapshot.tags_str()
@ -369,62 +465,3 @@ class ArchiveResultAdmin(admin.ModelAdmin):
obj.output if (obj.status == 'succeeded') and obj.extractor not in ('title', 'archive_org') else 'index.html',
obj.output,
)
tags_str.short_description = 'tags'
snapshot_str.short_description = 'snapshot'
class ArchiveBoxAdmin(admin.AdminSite):
site_header = 'ArchiveBox'
index_title = 'Links'
site_title = 'Index'
def get_urls(self):
return [
path('core/snapshot/add/', self.add_view, name='Add'),
] + super().get_urls()
def add_view(self, request):
if not request.user.is_authenticated:
return redirect(f'/admin/login/?next={request.path}')
request.current_app = self.name
context = {
**self.each_context(request),
'title': 'Add URLs',
}
if request.method == 'GET':
context['form'] = AddLinkForm()
elif request.method == 'POST':
form = AddLinkForm(request.POST)
if form.is_valid():
url = form.cleaned_data["url"]
print(f'[+] Adding URL: {url}')
depth = 0 if form.cleaned_data["depth"] == "0" else 1
input_kwargs = {
"urls": url,
"depth": depth,
"update_all": False,
"out_dir": OUTPUT_DIR,
}
add_stdout = StringIO()
with redirect_stdout(add_stdout):
add(**input_kwargs)
print(add_stdout.getvalue())
context.update({
"stdout": ansi_to_html(add_stdout.getvalue().strip()),
"form": AddLinkForm()
})
else:
context["form"] = form
return render(template_name='add.html', request=request, context=context)
admin.site = ArchiveBoxAdmin()
admin.site.register(get_user_model())
admin.site.register(Snapshot, SnapshotAdmin)
admin.site.register(Tag, TagAdmin)
admin.site.register(ArchiveResult, ArchiveResultAdmin)
admin.site.disable_action('delete_selected')


@ -3,5 +3,8 @@ from django.apps import AppConfig
class CoreConfig(AppConfig):
name = 'core'
# WIP: broken by Django 3.1.2 -> 4.0 migration
default_auto_field = 'django.db.models.UUIDField'
def ready(self):
from .auth import register_signals
register_signals()

archivebox/core/auth.py (new file, +13 lines)

@ -0,0 +1,13 @@
import os
from django.conf import settings
from ..config import (
LDAP
)
def register_signals():
if LDAP:
import django_auth_ldap.backend
from .auth_ldap import create_user
django_auth_ldap.backend.populate_user.connect(create_user)


@ -0,0 +1,12 @@
from django.conf import settings
from ..config import (
LDAP_CREATE_SUPERUSER
)
def create_user(sender, user=None, ldap_user=None, **kwargs):
if not user.id and LDAP_CREATE_SUPERUSER:
user.is_superuser = True
user.is_staff = True
print(f'[!] WARNING: Creating new user {user} based on LDAP user {ldap_user} (is_staff={user.is_staff}, is_superuser={user.is_superuser})')


@ -61,6 +61,7 @@ INSTALLED_APPS = [
'django.contrib.admin',
'core',
'api',
'django_extensions',
]
@ -269,9 +270,6 @@ AUTH_PASSWORD_VALIDATORS = [
{'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'},
]
# WIP: broken by Django 3.1.2 -> 4.0 migration
DEFAULT_AUTO_FIELD = 'django.db.models.UUIDField'
################################################################################
### Shell Settings
################################################################################
@ -290,7 +288,6 @@ if IS_SHELL:
LANGUAGE_CODE = 'en-us'
USE_I18N = True
USE_L10N = True
USE_TZ = True
DATETIME_FORMAT = 'Y-m-d g:iA'
SHORT_DATETIME_FORMAT = 'Y-m-d h:iA'


@ -1,4 +1,4 @@
from django.contrib import admin
from .admin import archivebox_admin
from django.urls import path, include
from django.views import static
@ -8,6 +8,13 @@ from django.views.generic.base import RedirectView
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
from ninja import NinjaAPI
from api.auth import GlobalAuth
api = NinjaAPI(auth=GlobalAuth())
api.add_router("/auth/", "api.auth.router")
api.add_router("/archive/", "api.archive.router")
# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
# from config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}
@ -34,10 +41,9 @@ urlpatterns = [
path('accounts/', include('django.contrib.auth.urls')),
path('admin/', admin.site.urls),
path('admin/', archivebox_admin.urls),
# do not add extra_context like this as not all admin views (e.g. ModelAdmin.autocomplete_view accept extra kwargs)
# path('admin/', admin.site.urls, {'extra_context': GLOBAL_CONTEXT}),
path("api/", api.urls),
path('health/', HealthCheckView.as_view(), name='healthcheck'),
path('error/', lambda _: 1/0),
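
Putting the pieces together, a hedged end-to-end sketch of calling the newly mounted API with a bearer token (host, credentials, and URL are illustrative):

import requests

API = 'http://127.0.0.1:8000/api'

token = requests.post(f'{API}/auth/authenticate',
                      json={'email': 'admin', 'password': 'hunter2'}).json()['token']

resp = requests.post(f'{API}/archive/add',
                     headers={'Authorization': f'Bearer {token}'},  # parsed by GlobalAuth (HttpBearer)
                     json={'urls': ['https://example.com'], 'tag': 'api-test'})
print(resp.json()['status'])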


@ -131,7 +131,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
link = load_link_details(link, out_dir=out_dir)
write_link_details(link, out_dir=out_dir, skip_sql_index=False)
log_link_archiving_started(link, out_dir, is_new)
log_link_archiving_started(link, str(out_dir), is_new)
link = link.overwrite(updated=datetime.now(timezone.utc))
stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
start_ts = datetime.now(timezone.utc)
@ -165,16 +165,6 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
# print('{black} X {}{reset}'.format(method_name, **ANSI))
stats['skipped'] += 1
except Exception as e:
# Disabled until https://github.com/ArchiveBox/ArchiveBox/issues/984
# and https://github.com/ArchiveBox/ArchiveBox/issues/1014
# are fixed.
"""
raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
method_name,
link.url,
)) from e
"""
# Instead, use the kludgy workaround from
# https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627
with open(ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
@ -186,6 +176,13 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
ts
) + "\n" + str(e) + "\n"))
#f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
# print(f' ERROR: {method_name} {e.__class__.__name__}: {e} {getattr(e, "hints", "")}', ts, link.url, command)
raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
method_name,
link.url,
)) from e
# print(' ', stats)
@ -218,7 +215,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
if type(all_links) is QuerySet:
num_links: int = all_links.count()
get_link = lambda x: x.as_link()
get_link = lambda x: x.as_link_with_details()
all_links = all_links.iterator()
else:
num_links: int = len(all_links)


@ -10,10 +10,12 @@ from ..system import run, chmod_file
from ..util import (
enforce_types,
is_static_file,
dedupe,
)
from ..config import (
TIMEOUT,
CURL_ARGS,
CURL_EXTRA_ARGS,
CHECK_SSL_VALIDITY,
SAVE_ARCHIVE_DOT_ORG,
CURL_BINARY,
@ -44,13 +46,18 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
output: ArchiveOutput = 'archive.org.txt'
archive_org_url = None
submit_url = 'https://web.archive.org/save/{}'.format(link.url)
cmd = [
CURL_BINARY,
# later options take precedence
options = [
*CURL_ARGS,
*CURL_EXTRA_ARGS,
'--head',
'--max-time', str(timeout),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
]
cmd = [
CURL_BINARY,
*dedupe(options),
submit_url,
]
status = 'succeeded'
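
The dedupe() helper now imported from ..util is not shown in this section; judging from the "later options take precedence" comments and the inline logic it replaces in singlefile.py further down, a sketch of the assumed behavior (keyed on the option name before any '='):

def dedupe(options: list) -> list:
    """Deduplicate CLI options by option name, letting later occurrences win."""
    deduped = {}
    for option in options:
        deduped[option.split('=')[0]] = option  # '--max-time' and '--max-time=30' share the key '--max-time'
    return list(deduped.values())

# e.g. a user-supplied CURL_EXTRA_ARGS value overriding a stock CURL_ARGS value:
print(dedupe(['--silent', '--max-time=60', '--max-time=10']))
# ['--silent', '--max-time=10']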


@ -6,13 +6,18 @@ from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..system import chmod_file, run
from ..util import enforce_types, domain
from ..util import (
enforce_types,
domain,
dedupe,
)
from ..config import (
TIMEOUT,
SAVE_FAVICON,
FAVICON_PROVIDER,
CURL_BINARY,
CURL_ARGS,
CURL_EXTRA_ARGS,
CURL_VERSION,
CHECK_SSL_VALIDITY,
CURL_USER_AGENT,
@ -34,13 +39,18 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'favicon.ico'
cmd = [
CURL_BINARY,
# later options take precedence
options = [
*CURL_ARGS,
*CURL_EXTRA_ARGS,
'--max-time', str(timeout),
'--output', str(output),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
]
cmd = [
CURL_BINARY,
*dedupe(options),
FAVICON_PROVIDER.format(domain(link.url)),
]
status = 'failed'


@ -9,11 +9,13 @@ from ..system import atomic_write
from ..util import (
enforce_types,
get_headers,
dedupe,
)
from ..config import (
TIMEOUT,
CURL_BINARY,
CURL_ARGS,
CURL_EXTRA_ARGS,
CURL_USER_AGENT,
CURL_VERSION,
CHECK_SSL_VALIDITY,
@ -40,14 +42,18 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
cmd = [
CURL_BINARY,
# later options take precedence
options = [
*CURL_ARGS,
*CURL_EXTRA_ARGS,
'--head',
'--max-time', str(timeout),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
]
cmd = [
CURL_BINARY,
*dedupe(options),
link.url,
]
try:


@ -121,9 +121,11 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
out_dir = Path(out_dir or link.link_dir)
output = "htmltotext.txt"
cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
timer = TimedProgress(timeout, prefix=' ')
extracted_text = None
status = 'failed'
try:
extractor = HTMLTextExtractor()
document = get_html(link, out_dir)
@ -136,10 +138,9 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
extracted_text = str(extractor)
atomic_write(str(out_dir / output), extracted_text)
status = 'succeeded'
except (Exception, OSError) as err:
status = 'failed'
output = err
cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
finally:
timer.end()


@ -8,11 +8,13 @@ from ..system import run, chmod_file
from ..util import (
enforce_types,
is_static_file,
dedupe,
)
from ..config import (
MEDIA_TIMEOUT,
SAVE_MEDIA,
YOUTUBEDL_ARGS,
YOUTUBEDL_EXTRA_ARGS,
YOUTUBEDL_BINARY,
YOUTUBEDL_VERSION,
CHECK_SSL_VALIDITY
@ -39,11 +41,16 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
output: ArchiveOutput = 'media'
output_path = out_dir / output
output_path.mkdir(exist_ok=True)
cmd = [
YOUTUBEDL_BINARY,
# later options take precedence
options = [
*YOUTUBEDL_ARGS,
*YOUTUBEDL_EXTRA_ARGS,
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
# TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
]
cmd = [
YOUTUBEDL_BINARY,
*dedupe(options),
link.url,
]
status = 'succeeded'


@ -11,13 +11,15 @@ from ..system import run, atomic_write
from ..util import (
enforce_types,
is_static_file,
dedupe,
)
from ..config import (
TIMEOUT,
SAVE_MERCURY,
DEPENDENCIES,
MERCURY_VERSION,
MERCURY_ARGS,
MERCURY_EXTRA_ARGS,
)
from ..logging_util import TimedProgress
@ -60,12 +62,16 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
timer = TimedProgress(timeout, prefix=' ')
try:
output_folder.mkdir(exist_ok=True)
# Get plain text version of article
# later options take precedence
options = [
*MERCURY_ARGS,
*MERCURY_EXTRA_ARGS,
]
# By default, get plain text version of article
cmd = [
DEPENDENCIES['MERCURY_BINARY']['path'],
link.url,
"--format=text"
*dedupe(options)
]
result = run(cmd, cwd=out_dir, timeout=timeout)
try:


@ -11,6 +11,7 @@ from ..util import (
enforce_types,
is_static_file,
chrome_args,
dedupe,
)
from ..config import (
TIMEOUT,
@ -18,7 +19,9 @@ from ..config import (
DEPENDENCIES,
SINGLEFILE_VERSION,
SINGLEFILE_ARGS,
SINGLEFILE_EXTRA_ARGS,
CHROME_BINARY,
COOKIES_FILE,
)
from ..logging_util import TimedProgress
@ -46,37 +49,24 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
# SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
# later options take precedence
options = [
*SINGLEFILE_ARGS,
'--browser-executable-path={}'.format(CHROME_BINARY),
*(["--browser-cookies-file={}".format(COOKIES_FILE)] if COOKIES_FILE else []),
browser_args,
*SINGLEFILE_ARGS,
*SINGLEFILE_EXTRA_ARGS,
]
# Deduplicate options (single-file doesn't like when you use the same option two times)
#
# NOTE: Options names that come first clobber conflicting names that come later
# My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most
# specificity, therefore the user sets it with a lot intent, therefore it should take precedence
# kind of like the ergonomic principle of lexical scope in programming languages.
seen_option_names = []
def test_seen(argument):
option_name = argument.split("=")[0]
if option_name in seen_option_names:
return False
else:
seen_option_names.append(option_name)
return True
deduped_options = list(filter(test_seen, options))
cmd = [
DEPENDENCIES['SINGLEFILE_BINARY']['path'],
*deduped_options,
*dedupe(options),
link.url,
output,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
result = None
try:
result = run(cmd, cwd=str(out_dir), timeout=timeout)
@ -84,7 +74,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
output_tail = [
line.strip()
for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
if line.strip()
]
hints = (
@ -94,12 +84,13 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
# Check for common failure cases
if (result.returncode > 0) or not (out_dir / output).is_file():
raise ArchiveError('SingleFile was not able to archive the page', hints)
raise ArchiveError(f'SingleFile was not able to archive the page (status={result.returncode})', hints)
chmod_file(output, cwd=str(out_dir))
except (Exception, OSError) as err:
status = 'failed'
# TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
cmd[2] = browser_args.replace('"', "\\\"")
err.hints = (result.stdout + result.stderr).decode().split('\n')
output = err
finally:
timer.end()


@ -10,6 +10,7 @@ from ..util import (
enforce_types,
download_url,
htmldecode,
dedupe,
)
from ..config import (
TIMEOUT,
@ -17,6 +18,7 @@ from ..config import (
SAVE_TITLE,
CURL_BINARY,
CURL_ARGS,
CURL_EXTRA_ARGS,
CURL_VERSION,
CURL_USER_AGENT,
)
@ -75,7 +77,7 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
with open(abs_path / source, "r", encoding="utf-8") as f:
document = f.read()
break
except (FileNotFoundError, TypeError):
except (FileNotFoundError, TypeError, UnicodeDecodeError):
continue
if document is None:
return download_url(link.url, timeout=timeout)
@ -102,12 +104,17 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
from core.models import Snapshot
output: ArchiveOutput = None
cmd = [
CURL_BINARY,
# later options take precedence
options = [
*CURL_ARGS,
*CURL_EXTRA_ARGS,
'--max-time', str(timeout),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
]
cmd = [
CURL_BINARY,
*dedupe(options),
link.url,
]
status = 'succeeded'


@ -15,9 +15,11 @@ from ..util import (
path,
domain,
urldecode,
dedupe,
)
from ..config import (
WGET_ARGS,
WGET_EXTRA_ARGS,
TIMEOUT,
SAVE_WGET,
SAVE_WARC,
@ -55,10 +57,10 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
# WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
output: ArchiveOutput = None
cmd = [
WGET_BINARY,
# '--server-response', # print headers for better error parsing
# later options take precedence
options = [
*WGET_ARGS,
*WGET_EXTRA_ARGS,
'--timeout={}'.format(timeout),
*(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
*(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
@ -68,6 +70,11 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
*(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
*([] if SAVE_WARC else ['--timestamping']),
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
# '--server-response', # print headers for better error parsing
]
cmd = [
WGET_BINARY,
*dedupe(options),
link.url,
]

archivebox/index.sqlite3 (new file)


@ -250,7 +250,7 @@ def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
"""parse and load existing index with any new links from import_path merged in"""
from core.models import Snapshot
try:
return Snapshot.objects.all()
return Snapshot.objects.all().only('id')
except (KeyboardInterrupt, SystemExit):
raise SystemExit(0)
@ -407,7 +407,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links without checking archive status or data directory validity"""
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
links = (snapshot.as_link() for snapshot in snapshots.iterator())
return {
link.link_dir: link
for link in links
@ -415,7 +415,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links that are archived with a valid data directory"""
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
links = (snapshot.as_link() for snapshot in snapshots.iterator())
return {
link.link_dir: link
for link in filter(is_archived, links)
@ -423,7 +423,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links that are unarchived with no data directory or an empty data directory"""
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
links = (snapshot.as_link() for snapshot in snapshots.iterator())
return {
link.link_dir: link
for link in filter(is_unarchived, links)


@ -432,12 +432,14 @@ def log_archive_method_finished(result: "ArchiveResult"):
**ANSI,
),
]
# import pudb; pudb.set_trace()
# Prettify error output hints string and limit to five lines
hints = getattr(result.output, 'hints', None) or ()
if hints:
if isinstance(hints, (list, tuple, type(_ for _ in ()))):
hints = [hint.decode() for hint in hints if isinstance(hint, bytes)]
hints = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints]
else:
if isinstance(hints, bytes):
hints = hints.decode()
@ -492,12 +494,12 @@ def log_removal_started(links: List["Link"], yes: bool, delete: bool):
if delete:
file_counts = [link.num_outputs for link in links if Path(link.link_dir).exists()]
print(
f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n' +
f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
)
else:
print(
' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n'
' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n' +
' (Pass --delete if you also want to permanently delete the data folders)'
)
@ -636,17 +638,15 @@ def printable_folder_status(name: str, folder: Dict) -> str:
@enforce_types
def printable_dependency_version(name: str, dependency: Dict) -> str:
version = None
color, symbol, note, version = 'red', 'X', 'invalid', '?'
if dependency['enabled']:
if dependency['is_valid']:
color, symbol, note, version = 'green', '', 'valid', ''
color, symbol, note = 'green', '', 'valid'
parsed_version_num = re.search(r'[\d\.]+', dependency['version'])
if parsed_version_num:
version = f'v{parsed_version_num[0]}'
if not version:
color, symbol, note, version = 'red', 'X', 'invalid', '?'
else:
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'


@ -791,6 +791,8 @@ def update(resume: Optional[float]=None,
out_dir: Path=OUTPUT_DIR) -> List[Link]:
"""Import any new links from subscriptions and retry any previously failed/skipped links"""
from core.models import ArchiveResult
check_data_folder(out_dir=out_dir)
check_dependencies()
new_links: List[Link] = [] # TODO: Remove input argument: only_new
@ -798,19 +800,23 @@ def update(resume: Optional[float]=None,
extractors = extractors.split(",") if extractors else []
# Step 1: Filter for selected_links
print('[*] Finding matching Snapshots to update...')
print(f' - Filtering by {" ".join(filter_patterns)} ({filter_type}) {before=} {after=} {status=}...')
matching_snapshots = list_links(
filter_patterns=filter_patterns,
filter_type=filter_type,
before=before,
after=after,
)
print(f' - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...')
matching_folders = list_folders(
links=matching_snapshots,
status=status,
out_dir=out_dir,
)
all_links = [link for link in matching_folders.values() if link]
all_links = (link for link in matching_folders.values() if link)
print(' - Sorting by most unfinished -> least unfinished + date archived...')
all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp))
if index_only:
for link in all_links:
@ -836,6 +842,7 @@ def update(resume: Optional[float]=None,
if extractors:
archive_kwargs["methods"] = extractors
archive_links(to_archive, overwrite=overwrite, **archive_kwargs)
# Step 4: Re-write links index with updated titles, icons, and resources

archivebox/package-lock.json (generated file, +2371 lines; diff suppressed because it is too large)


@ -1,6 +1,6 @@
{
"name": "archivebox",
"version": "0.7.3",
"version": "0.8.0",
"description": "ArchiveBox: The self-hosted internet archive",
"author": "Nick Sweeting <archivebox-npm@sweeting.me>",
"repository": "github:ArchiveBox/ArchiveBox",
@ -8,6 +8,6 @@
"dependencies": {
"@postlight/parser": "^2.2.3",
"readability-extractor": "github:ArchiveBox/readability-extractor",
"single-file-cli": "^1.1.46"
"single-file-cli": "^1.1.54"
}
}


@ -44,6 +44,7 @@ from . import medium_rss
from . import netscape_html
from . import generic_rss
from . import generic_json
from . import generic_jsonl
from . import generic_html
from . import generic_txt
from . import url_list
@ -63,6 +64,7 @@ PARSERS = {
netscape_html.KEY: (netscape_html.NAME, netscape_html.PARSER),
generic_rss.KEY: (generic_rss.NAME, generic_rss.PARSER),
generic_json.KEY: (generic_json.NAME, generic_json.PARSER),
generic_jsonl.KEY: (generic_jsonl.NAME, generic_jsonl.PARSER),
generic_html.KEY: (generic_html.NAME, generic_html.PARSER),
# Catchall fallback parser


@ -11,6 +11,60 @@ from ..util import (
enforce_types,
)
# This gets used by generic_jsonl, too
def jsonObjectToLink(link: dict, source: str):
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
# example line
# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
# Parse URL
url = link.get('href') or link.get('url') or link.get('URL')
if not url:
raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
# Parse the timestamp
ts_str = str(datetime.now(timezone.utc).timestamp())
if link.get('timestamp'):
# chrome/ff histories use a very precise timestamp
ts_str = str(link['timestamp'] / 10000000)
elif link.get('time'):
ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
elif link.get('created_at'):
ts_str = str(json_date(link['created_at']).timestamp())
elif link.get('created'):
ts_str = str(json_date(link['created']).timestamp())
elif link.get('date'):
ts_str = str(json_date(link['date']).timestamp())
elif link.get('bookmarked'):
ts_str = str(json_date(link['bookmarked']).timestamp())
elif link.get('saved'):
ts_str = str(json_date(link['saved']).timestamp())
# Parse the title
title = None
if link.get('title'):
title = link['title'].strip()
elif link.get('description'):
title = link['description'].replace(' — Readability', '').strip()
elif link.get('name'):
title = link['name'].strip()
# if we have a list, join it with commas
tags = link.get('tags')
if type(tags) == list:
tags = ','.join(tags)
elif type(tags) == str:
# if there's no comma, assume it was space-separated
if ',' not in tags:
tags = tags.replace(' ', ',')
return Link(
url=htmldecode(url),
timestamp=ts_str,
title=htmldecode(title) or None,
tags=htmldecode(tags),
sources=[source],
)
@enforce_types
def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
@ -18,55 +72,21 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
json_file.seek(0)
# sometimes the first line is a comment or filepath, so we get everything after the first {
json_file_json_str = '{' + json_file.read().split('{', 1)[-1]
links = json.loads(json_file_json_str)
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
try:
links = json.load(json_file)
if type(links) != list:
raise Exception('JSON parser expects list of objects, maybe this is JSONL?')
except json.decoder.JSONDecodeError:
# sometimes the first line is a comment or other junk, so try without
json_file.seek(0)
first_line = json_file.readline()
#print(' > Trying JSON parser without first line: "', first_line.strip(), '"', sep= '')
links = json.load(json_file)
# we may fail again, which means we really don't know what to do
for link in links:
# example line
# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
if link:
# Parse URL
url = link.get('href') or link.get('url') or link.get('URL')
if not url:
raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
# Parse the timestamp
ts_str = str(datetime.now(timezone.utc).timestamp())
if link.get('timestamp'):
# chrome/ff histories use a very precise timestamp
ts_str = str(link['timestamp'] / 10000000)
elif link.get('time'):
ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
elif link.get('created_at'):
ts_str = str(json_date(link['created_at']).timestamp())
elif link.get('created'):
ts_str = str(json_date(link['created']).timestamp())
elif link.get('date'):
ts_str = str(json_date(link['date']).timestamp())
elif link.get('bookmarked'):
ts_str = str(json_date(link['bookmarked']).timestamp())
elif link.get('saved'):
ts_str = str(json_date(link['saved']).timestamp())
# Parse the title
title = None
if link.get('title'):
title = link['title'].strip()
elif link.get('description'):
title = link['description'].replace(' — Readability', '').strip()
elif link.get('name'):
title = link['name'].strip()
yield Link(
url=htmldecode(url),
timestamp=ts_str,
title=htmldecode(title) or None,
tags=htmldecode(link.get('tags')) or '',
sources=[json_file.name],
)
yield jsonObjectToLink(link, json_file.name)
KEY = 'json'
NAME = 'Generic JSON'

View file

@ -0,0 +1,34 @@
__package__ = 'archivebox.parsers'
import json
from typing import IO, Iterable
from datetime import datetime, timezone
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
)
from .generic_json import jsonObjectToLink
def parse_line(line: str):
if line.strip() != "":
return json.loads(line)
@enforce_types
def parse_generic_jsonl_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse JSONL format bookmarks export files"""
json_file.seek(0)
links = [ parse_line(line) for line in json_file ]
for link in links:
if link:
yield jsonObjectToLink(link, json_file.name)
KEY = 'jsonl'
NAME = 'Generic JSONL'
PARSER = parse_generic_jsonl_export
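A small end-to-end sketch of the new JSONL parser, assuming one JSON object per line (blank lines are skipped by parse_line). The file name, its contents, and the import path (archivebox/parsers/generic_jsonl.py, as the package header above suggests) are illustrative:

from pathlib import Path
from archivebox.parsers.generic_jsonl import parse_generic_jsonl_export

Path('bookmarks.jsonl').write_text(
    '{"url": "https://example.com", "title": "Example", "tags": ["demo", "test"]}\n'
    '\n'
    '{"href": "https://archivebox.io", "description": "ArchiveBox", "time": "2024-04-24T16:24:16Z", "tags": "selfhosted archiving"}\n'
)
with open('bookmarks.jsonl') as jsonl_file:
    for link in parse_generic_jsonl_export(jsonl_file):
        print(link.url, link.title, link.tags)
# https://example.com Example demo,test
# https://archivebox.io ArchiveBox selfhosted,archiving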

View file

@ -2,13 +2,13 @@ __package__ = 'archivebox.parsers'
from typing import IO, Iterable
from datetime import datetime
from time import mktime
from feedparser import parse as feedparser
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
str_between,
enforce_types
)
@enforce_types
@ -16,35 +16,27 @@ def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse RSS XML-format files into links"""
rss_file.seek(0)
items = rss_file.read().split('<item>')
items = items[1:] if items else []
for item in items:
# example item:
# <item>
# <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
# <category>Unread</category>
# <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
# <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
# <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
# </item>
feed = feedparser(rss_file.read())
for item in feed.entries:
url = item.link
title = item.title
time = mktime(item.updated_parsed)
trailing_removed = item.split('</item>', 1)[0]
leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
rows = leading_removed.split('\n')
try:
tags = ','.join(map(lambda tag: tag.term, item.tags))
except AttributeError:
tags = ''
def get_row(key):
return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
url = str_between(get_row('link'), '<link>', '</link>')
ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
if url is None:
# Yielding a Link with no URL will
# crash on a URL validation assertion
continue
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
timestamp=str(time),
title=htmldecode(title) or None,
tags=None,
tags=tags,
sources=[rss_file.name],
)
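For reference, a sketch of what the feedparser-based rewrite above gets back for the example item that used to live in the removed comment. Attribute availability can vary slightly between feedparser versions, so treat this as illustrative rather than exact:

import feedparser

feed = feedparser.parse(
    '<rss version="2.0"><channel><item>'
    '<title>How JavaScript works: inside the V8 engine</title>'
    '<category>Unread</category>'
    '<link>https://blog.sessionstack.com/how-javascript-works-inside</link>'
    '<pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>'
    '</item></channel></rss>'
)
entry = feed.entries[0]
entry.link             # 'https://blog.sessionstack.com/how-javascript-works-inside'
entry.title            # 'How JavaScript works: inside the V8 engine'
entry.updated_parsed   # time.struct_time suitable for time.mktime()
entry.tags[0].term     # 'Unread'  (items without <category> have no .tags, hence the AttributeError fallback)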

View file

@ -2,50 +2,41 @@ __package__ = 'archivebox.parsers'
from typing import IO, Iterable
from datetime import datetime, timezone
from xml.etree import ElementTree
from time import mktime
from feedparser import parse as feedparser
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
enforce_types
)
@enforce_types
def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse Pinboard RSS feed files into links"""
rss_file.seek(0)
root = ElementTree.parse(rss_file).getroot()
items = root.findall("{http://purl.org/rss/1.0/}item")
for item in items:
find = lambda p: item.find(p).text.strip() if item.find(p) is not None else None # type: ignore
feed = feedparser(rss_file.read())
for item in feed.entries:
url = item.link
# title will start with "[priv] " if pin was marked private. useful?
title = item.title
time = mktime(item.updated_parsed)
url = find("{http://purl.org/rss/1.0/}link")
tags = find("{http://purl.org/dc/elements/1.1/}subject")
title = find("{http://purl.org/rss/1.0/}title")
ts_str = find("{http://purl.org/dc/elements/1.1/}date")
# all tags are in one entry.tags with spaces in it. annoying!
try:
tags = item.tags[0].term.replace(' ', ',')
except AttributeError:
tags = ''
if url is None:
# Yielding a Link with no URL will
# crash on a URL validation assertion
continue
# Pinboard includes a colon in its date stamp timezone offsets, which
# Python can't parse. Remove it:
if ts_str and ts_str[-3:-2] == ":":
ts_str = ts_str[:-3]+ts_str[-2:]
if ts_str:
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
else:
time = datetime.now(timezone.utc)
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
timestamp=str(time),
title=htmldecode(title) or None,
tags=htmldecode(tags) or None,
sources=[rss_file.name],
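A short sketch of the Pinboard-specific tag handling above: Pinboard's feed puts all tags into a single space-separated term, so the parser converts the spaces to commas. The file path and tag values are illustrative, and the attribute names follow feedparser's usual mapping:

from pathlib import Path
from time import mktime
import feedparser

feed = feedparser.parse(Path('pinboard_export.rss').read_text())
entry = feed.entries[0]
entry.tags[0].term                     # e.g. 'python archiving selfhosted' (one term, space-separated)
entry.tags[0].term.replace(' ', ',')   # -> 'python,archiving,selfhosted'
str(mktime(entry.updated_parsed))      # epoch-seconds string stored as the Link timestamp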

View file

@ -30,8 +30,7 @@ def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False,
if capture_output:
if ('stdout' in kwargs) or ('stderr' in kwargs):
raise ValueError('stdout and stderr arguments may not be used '
'with capture_output.')
raise ValueError('stdout and stderr arguments may not be used with capture_output.')
kwargs['stdout'] = PIPE
kwargs['stderr'] = PIPE
@ -146,20 +145,24 @@ def get_dir_size(path: Union[str, Path], recursive: bool=True, pattern: Optional
recursively and limiting to a given filter list
"""
num_bytes, num_dirs, num_files = 0, 0, 0
for entry in os.scandir(path):
if (pattern is not None) and (pattern not in entry.path):
continue
if entry.is_dir(follow_symlinks=False):
if not recursive:
try:
for entry in os.scandir(path):
if (pattern is not None) and (pattern not in entry.path):
continue
num_dirs += 1
bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
num_bytes += bytes_inside
num_dirs += dirs_inside
num_files += files_inside
else:
num_bytes += entry.stat(follow_symlinks=False).st_size
num_files += 1
if entry.is_dir(follow_symlinks=False):
if not recursive:
continue
num_dirs += 1
bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
num_bytes += bytes_inside
num_dirs += dirs_inside
num_files += files_inside
else:
num_bytes += entry.stat(follow_symlinks=False).st_size
num_files += 1
except OSError:
# e.g. FileNameTooLong or other error while trying to read dir
pass
return num_bytes, num_dirs, num_files
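A brief usage sketch of the helper above, which now tolerates unreadable entries; the snapshot path and extractor pattern are illustrative:

# directories that raise OSError while being scanned are skipped instead of crashing
num_bytes, num_dirs, num_files = get_dir_size('archive/1402761102', pattern='wget')
print(f'{num_files} files in {num_dirs} dirs, {num_bytes} bytes total')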
@ -171,7 +174,7 @@ def dedupe_cron_jobs(cron: CronTab) -> CronTab:
deduped: Set[Tuple[str, str]] = set()
for job in list(cron):
unique_tuple = (str(job.slices), job.command)
unique_tuple = (str(job.slices), str(job.command))
if unique_tuple not in deduped:
deduped.add(unique_tuple)
cron.remove(job)

View file

@ -3,6 +3,7 @@ __package__ = 'archivebox'
import re
import requests
import json as pyjson
import http.cookiejar
from typing import List, Optional, Any
from pathlib import Path
@ -56,19 +57,57 @@ short_ts = lambda ts: str(parse_date(ts).timestamp()).split('.')[0]
ts_to_date_str = lambda ts: ts and parse_date(ts).strftime('%Y-%m-%d %H:%M')
ts_to_iso = lambda ts: ts and parse_date(ts).isoformat()
COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m')
# https://mathiasbynens.be/demo/url-regex
URL_REGEX = re.compile(
r'(?=('
r'http[s]?://' # start matching from allowed schemes
r'(?:[a-zA-Z]|[0-9]' # followed by allowed alphanum characters
r'|[-_$@.&+!*\(\),]' # or allowed symbols (keep hyphen first to match literal hyphen)
r'|(?:%[0-9a-fA-F][0-9a-fA-F]))' # or allowed unicode bytes
r'[^\]\[\(\)<>"\'\s]+' # stop parsing at these symbols
r'(?=(' +
r'http[s]?://' + # start matching from allowed schemes
r'(?:[a-zA-Z]|[0-9]' + # followed by allowed alphanum characters
r'|[-_$@.&+!*\(\),]' + # or allowed symbols (keep hyphen first to match literal hyphen)
r'|[^\u0000-\u007F])+' + # or allowed unicode bytes
r'[^\]\[<>"\'\s]+' + # stop parsing at these symbols
r'))',
re.IGNORECASE,
re.IGNORECASE | re.UNICODE,
)
COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m')
def parens_are_matched(string: str, open_char='(', close_char=')'):
"""check that all parentheses in a string are balanced and nested properly"""
count = 0
for c in string:
if c == open_char:
count += 1
elif c == close_char:
count -= 1
if count < 0:
return False
return count == 0
def fix_url_from_markdown(url_str: str) -> str:
"""
clean up a regex-parsed URL that may contain dangling trailing parens from markdown link syntax
helpful for fixing URLs parsed from markdown, e.g.
input: https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).somemoretext
result: https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def
"""
trimmed_url = url_str
# cut off one trailing character at a time
# until parens are balanced e.g. /a(b)c).x(y)z -> /a(b)c
while not parens_are_matched(trimmed_url):
trimmed_url = trimmed_url[:-1]
# make sure trimmed url is still valid
if re.findall(URL_REGEX, trimmed_url):
return trimmed_url
return url_str
def find_all_urls(urls_str: str):
for url in re.findall(URL_REGEX, urls_str):
yield fix_url_from_markdown(url)
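A short sketch of the two helpers above applied to a markdown snippet; the expected output follows the same trimming rules exercised by the assertions further down (the input text is illustrative):

text = 'see [the V8 article](https://en.wikipedia.org/wiki/V8_(JavaScript_engine)) and http://example.com/a(b)c) too'
print(list(find_all_urls(text)))
# ['https://en.wikipedia.org/wiki/V8_(JavaScript_engine)', 'http://example.com/a(b)c']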
def is_static_file(url: str):
# TODO: the proper way is with MIME type detection + ext, not only extension
@ -164,9 +203,22 @@ def parse_date(date: Any) -> Optional[datetime]:
@enforce_types
def download_url(url: str, timeout: int=None) -> str:
"""Download the contents of a remote url and return the text"""
from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT
from .config import (
TIMEOUT,
CHECK_SSL_VALIDITY,
WGET_USER_AGENT,
COOKIES_FILE,
)
timeout = timeout or TIMEOUT
response = requests.get(
session = requests.Session()
if COOKIES_FILE and Path(COOKIES_FILE).is_file():
cookie_jar = http.cookiejar.MozillaCookieJar(COOKIES_FILE)
cookie_jar.load(ignore_discard=True, ignore_expires=True)
for cookie in cookie_jar:
session.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path)
response = session.get(
url,
headers={'User-Agent': WGET_USER_AGENT},
verify=CHECK_SSL_VALIDITY,
@ -179,7 +231,11 @@ def download_url(url: str, timeout: int=None) -> str:
if encoding is not None:
response.encoding = encoding
return response.text
try:
return response.text
except UnicodeDecodeError:
# if the response is non-text (e.g. an image or other binary file), just return the filename instead
return url.rsplit('/', 1)[-1]
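For context, a sketch of what the new COOKIES_FILE support expects: a Netscape-format cookies.txt that http.cookiejar.MozillaCookieJar can read. The path and cookie values below are illustrative:

# COOKIES_FILE should look like a classic browser cookies.txt export, e.g.:
#
#   # Netscape HTTP Cookie File
#   .example.com	TRUE	/	FALSE	1735689600	sessionid	abc123
#
# (tab-separated: domain, include-subdomains, path, secure, expiry, name, value)
import http.cookiejar

cookie_jar = http.cookiejar.MozillaCookieJar('/data/cookies.txt')
cookie_jar.load(ignore_discard=True, ignore_expires=True)   # keep session and expired cookies too
for cookie in cookie_jar:
    print(cookie.domain, cookie.name, cookie.value)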
@enforce_types
def get_headers(url: str, timeout: int=None) -> str:
@ -223,7 +279,11 @@ def chrome_args(**options) -> List[str]:
# Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/
from .config import CHROME_OPTIONS, CHROME_VERSION
from .config import (
CHROME_OPTIONS,
CHROME_VERSION,
CHROME_EXTRA_ARGS,
)
options = {**CHROME_OPTIONS, **options}
@ -232,6 +292,8 @@ def chrome_args(**options) -> List[str]:
cmd_args = [options['CHROME_BINARY']]
cmd_args += CHROME_EXTRA_ARGS
if options['CHROME_HEADLESS']:
chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1])
if chrome_major_version >= 111:
@ -275,8 +337,10 @@ def chrome_args(**options) -> List[str]:
if options['CHROME_USER_DATA_DIR']:
cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
return cmd_args
cmd_args.append('--profile-directory=Default')
return dedupe(cmd_args)
def chrome_cleanup():
"""
@ -313,6 +377,20 @@ def ansi_to_html(text):
return COLOR_REGEX.sub(single_sub, text)
@enforce_types
def dedupe(options: List[str]) -> List[str]:
"""
Deduplicates the given options. Options that come later clobber earlier
conflicting options.
"""
deduped = {}
for option in options:
deduped[option.split('=')[0]] = option
return list(deduped.values())
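A tiny example of the behavior described in the docstring; chrome_args() relies on this so that CHROME_EXTRA_ARGS can override the built-in defaults (the flag values here are illustrative):

dedupe(['--headless=new', '--window-size=1440,2000', '--window-size=1920,1080'])
# -> ['--headless=new', '--window-size=1920,1080']   (the later --window-size wins)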
class AttributeDict(dict):
"""Helper to allow accessing dict values via Example.key or Example['key']"""
@ -359,3 +437,48 @@ class ExtendedEncoder(pyjson.JSONEncoder):
return pyjson.JSONEncoder.default(self, obj)
### URL PARSING TESTS / ASSERTIONS
# they run at runtime because I like having them inline in this file,
# I like the peace of mind knowing it's enforced at runtime across all OS's (in case the regex engine ever has any weird locale-specific quirks),
# and these assertions are basically instant, so not a big performance cost to do it on startup
assert fix_url_from_markdown('/a(b)c).x(y)z') == '/a(b)c'
assert fix_url_from_markdown('https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).link(with)_trailingtext') == 'https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def'
URL_REGEX_TESTS = [
('https://example.com', ['https://example.com']),
('http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234', ['http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234']),
('https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ abc', ['https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ', 'https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ']),
('<a href="https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ"> abc', ['https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ', 'https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ']),
('///a', []),
('http://', []),
('http://../', ['http://../']),
('http://-error-.invalid/', ['http://-error-.invalid/']),
('https://a(b)c+1#2?3&4/', ['https://a(b)c+1#2?3&4/']),
('http://उदाहरण.परीक्षा', ['http://उदाहरण.परीक्षा']),
('http://例子.测试', ['http://例子.测试']),
('http://➡.ws/䨹 htps://abc.1243?234', ['http://➡.ws/䨹']),
('http://⌘.ws">https://exa+mple.com//:abc ', ['http://⌘.ws', 'https://exa+mple.com//:abc']),
('http://مثال.إختبار/abc?def=ت&ب=abc#abc=234', ['http://مثال.إختبار/abc?def=ت&ب=abc#abc=234']),
('http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c\'om', ['http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c']),
('http://us:pa@ex.co:42/http://ex.co:19/a?_d=4#-a=2.3', ['http://us:pa@ex.co:42/http://ex.co:19/a?_d=4#-a=2.3', 'http://ex.co:19/a?_d=4#-a=2.3']),
('http://code.google.com/events/#&product=browser', ['http://code.google.com/events/#&product=browser']),
('http://foo.bar?q=Spaces should be encoded', ['http://foo.bar?q=Spaces']),
('http://foo.com/blah_(wikipedia)#c(i)t[e]-1', ['http://foo.com/blah_(wikipedia)#c(i)t']),
('http://foo.com/(something)?after=parens', ['http://foo.com/(something)?after=parens']),
('http://foo.com/unicode_(✪)_in_parens) abc', ['http://foo.com/unicode_(✪)_in_parens']),
('http://foo.bar/?q=Test%20URL-encoded%20stuff', ['http://foo.bar/?q=Test%20URL-encoded%20stuff']),
('[xyz](http://a.b/?q=(Test)%20U)RL-encoded%20stuff', ['http://a.b/?q=(Test)%20U']),
('[xyz](http://a.b/?q=(Test)%20U)-ab https://abc+123', ['http://a.b/?q=(Test)%20U', 'https://abc+123']),
('[xyz](http://a.b/?q=(Test)%20U) https://a(b)c+12)3', ['http://a.b/?q=(Test)%20U', 'https://a(b)c+12']),
('[xyz](http://a.b/?q=(Test)a\nabchttps://a(b)c+12)3', ['http://a.b/?q=(Test)a', 'https://a(b)c+12']),
('http://foo.bar/?q=Test%20URL-encoded%20stuff', ['http://foo.bar/?q=Test%20URL-encoded%20stuff']),
]
for urls_str, expected_url_matches in URL_REGEX_TESTS:
url_matches = list(find_all_urls(urls_str))
assert url_matches == expected_url_matches, 'FAILED URL_REGEX CHECK!'

6
archivebox/vendor/requirements.txt vendored Normal file
View file

@ -0,0 +1,6 @@
# this folder contains vendored versions of these packages
atomicwrites==1.4.0
pocket==0.3.7
django-taggit==1.3.0
base32-crockford==0.3.0