Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-24 11:47:04 -04:00)

Merge pull request #403 from cdvv7788/single-file

Commit: c8e3aed647

16 changed files with 236 additions and 47 deletions
.github/workflows/test.yml (vendored): 24 lines changed

@@ -40,6 +40,22 @@ jobs:
         with:
           fetch-depth: 1
 
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 1
+          repository: "gildas-lormeau/SingleFile"
+          ref: "master"
+          path: "singlefile"
+
+      - name: Install npm requirements for singlefile
+        run: npm install --prefix singlefile/cli
+
+      - name: Give singlefile execution permissions
+        run: chmod +x singlefile/cli/single-file
+
+      - name: Set SINGLEFILE_BINARY
+        run: echo "::set-env name=SINGLEFILE_BINARY::$GITHUB_WORKSPACE/singlefile/cli/single-file"
+
       - name: Set up Python ${{ matrix.python }}
         uses: actions/setup-python@v1
         with:
@@ -60,6 +76,14 @@ jobs:
           restore-keys: |
             ${{ runner.os }}-${{ matrix.python }}-venv-
 
+      - name: Use nodejs 14.7.0
+        uses: actions/setup-node@v1
+        with:
+          node-version: 14.7.0
+
+      - name: Debug
+        run: ls ./
+
       - name: Install dependencies
         run: |
           python -m pip install .
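Aside: the last added step exports SINGLEFILE_BINARY via the `::set-env` workflow command, which GitHub Actions later deprecated in favor of appending to the file named by $GITHUB_ENV. A sketch of the replacement mechanism, written in Python (the project's language) with a hypothetical helper that is not part of this PR:

    # Hypothetical sketch: the modern replacement for the deprecated ::set-env
    # command is appending "NAME=value" lines to the file GitHub exposes via
    # the $GITHUB_ENV environment variable.
    import os

    def export_github_env(name: str, value: str) -> None:
        """Make `name` visible to all subsequent steps of the same job."""
        with open(os.environ["GITHUB_ENV"], "a") as f:
            f.write(f"{name}={value}\n")

    export_github_env(
        "SINGLEFILE_BINARY",
        os.path.join(os.environ["GITHUB_WORKSPACE"], "singlefile/cli/single-file"),
    )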
Dockerfile: 19 lines changed

@@ -22,9 +22,10 @@ ENV TZ=UTC \
     APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \
     CODE_PATH=/app \
     VENV_PATH=/venv \
-    DATA_PATH=/data
+    DATA_PATH=/data \
+    EXTRA_PATH=/extra
 
-# First install CLI utils and base deps, then Chrome + Fons
+# First install CLI utils and base deps, then Chrome + Fons + nodejs
 RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections \
     && apt-get update -qq \
     && apt-get install -qq -y --no-install-recommends \
@@ -32,6 +33,7 @@ RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selectio
         dumb-init jq git wget curl youtube-dl ffmpeg \
     && curl -sSL "https://dl.google.com/linux/linux_signing_key.pub" | apt-key add - \
     && echo "deb https://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \
+    && curl -sL https://deb.nodesource.com/setup_14.x | bash - \
     && apt-get update -qq \
     && apt-get install -qq -y --no-install-recommends \
         google-chrome-stable \
@@ -43,8 +45,18 @@ RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selectio
         fonts-symbola \
         fonts-noto \
         fonts-freefont-ttf \
+        nodejs \
+        unzip \
     && rm -rf /var/lib/apt/lists/*
 
+# Clone singlefile and move it to the /bin folder so archivebox can find it
+
+WORKDIR "$EXTRA_PATH"
+RUN wget -qO - https://github.com/gildas-lormeau/SingleFile/archive/master.zip > SingleFile.zip \
+    && unzip -q SingleFile.zip \
+    && npm install --prefix SingleFile-master/cli --production > /dev/null 2>&1 \
+    && chmod +x SingleFile-master/cli/single-file
+
 # Run everything from here on out as non-privileged user
 RUN groupadd --system archivebox \
     && useradd --system --create-home --gid archivebox --groups audio,video archivebox
@@ -60,7 +72,8 @@ VOLUME "$DATA_PATH"
 WORKDIR "$DATA_PATH"
 EXPOSE 8000
 ENV CHROME_BINARY=google-chrome \
-    CHROME_SANDBOX=False
+    CHROME_SANDBOX=False \
+    SINGLEFILE_BINARY="$EXTRA_PATH/SingleFile-master/cli/single-file"
 
 RUN env ALLOW_ROOT=True archivebox version
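The final `RUN env ALLOW_ROOT=True archivebox version` doubles as a build-time smoke test that the new dependency resolves. A standalone sketch of the same property the image relies on (hypothetical helper; the PR's own config derives SINGLEFILE_VERSION by running the binary with --version via bin_version()):

    # Hypothetical smoke test: confirm the single-file CLI installed under
    # $EXTRA_PATH is executable and answers --version, the same property
    # archivebox's bin_version() check depends on.
    import os
    import subprocess

    def singlefile_is_usable(binary: str) -> bool:
        try:
            result = subprocess.run([binary, '--version'],
                                    capture_output=True, timeout=30)
            return result.returncode == 0
        except (FileNotFoundError, subprocess.TimeoutExpired):
            return False

    if __name__ == '__main__':
        binary = os.environ.get('SINGLEFILE_BINARY', 'single-file')
        print(f'{binary} usable: {singlefile_is_usable(binary)}')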
archivebox/config.py (header lost in mirror)

@@ -74,6 +74,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)},
         'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)},
         'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
+        'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)},
         'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
         'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
         'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
@@ -104,6 +105,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
     'DEPENDENCY_CONFIG': {
         'USE_CURL': {'type': bool, 'default': True},
         'USE_WGET': {'type': bool, 'default': True},
+        'USE_SINGLEFILE': {'type': bool, 'default': True},
         'USE_GIT': {'type': bool, 'default': True},
         'USE_CHROME': {'type': bool, 'default': True},
         'USE_YOUTUBEDL': {'type': bool, 'default': True},
@@ -111,6 +113,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'CURL_BINARY': {'type': str, 'default': 'curl'},
         'GIT_BINARY': {'type': str, 'default': 'git'},
         'WGET_BINARY': {'type': str, 'default': 'wget'},
+        'SINGLEFILE_BINARY': {'type': str, 'default': 'single-file'},
         'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
         'CHROME_BINARY': {'type': str, 'default': None},
     },
@@ -249,6 +252,10 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
     'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
     'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
 
+    'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and (c['SAVE_SINGLEFILE'])},
+    'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
+    'SAVE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
+
     'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
     'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
     'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
@@ -674,6 +681,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
         'enabled': config['USE_WGET'],
         'is_valid': bool(config['WGET_VERSION']),
     },
+    'SINGLEFILE_BINARY': {
+        'path': bin_path(config['SINGLEFILE_BINARY']),
+        'version': config['SINGLEFILE_VERSION'],
+        'hash': bin_hash(config['SINGLEFILE_BINARY']),
+        'enabled': config['USE_SINGLEFILE'],
+        'is_valid': bool(config['SINGLEFILE_VERSION']),
+    },
     'GIT_BINARY': {
         'path': bin_path(config['GIT_BINARY']),
         'version': config['GIT_VERSION'],
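The DERIVED_CONFIG_DEFAULTS entries are lambdas evaluated in declaration order against the config built so far, so disabling either USE_SINGLEFILE or SAVE_SINGLEFILE short-circuits the version probe of the binary. A toy model of that evaluation order (a simplified stand-in, not archivebox's actual loader):

    # Toy model of how DERIVED_CONFIG_DEFAULTS resolves: each derived key sees
    # the values computed before it, so USE_SINGLEFILE folds in SAVE_SINGLEFILE
    # before SINGLEFILE_VERSION decides whether to probe the binary at all.
    config = {'USE_SINGLEFILE': True, 'SAVE_SINGLEFILE': False,
              'SINGLEFILE_BINARY': 'single-file'}

    derived = {
        'USE_SINGLEFILE': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE'],
        # stand-in for bin_version(): only runs when USE_SINGLEFILE survived
        'SINGLEFILE_VERSION': lambda c: ('probe ' + c['SINGLEFILE_BINARY'])
                                        if c['USE_SINGLEFILE'] else None,
        'SAVE_SINGLEFILE': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE'],
    }

    for key, default in derived.items():
        config[key] = default(config)

    print(config)
    # {'USE_SINGLEFILE': False, 'SAVE_SINGLEFILE': False,
    #  'SINGLEFILE_BINARY': 'single-file', 'SINGLEFILE_VERSION': None}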
archivebox/core/admin.py (header lost in mirror)

@@ -106,6 +106,7 @@ class SnapshotAdmin(admin.ModelAdmin):
             '<a href="/{}/{}" class="exists-{}" title="Screenshot">🖥 </a> '
             '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
             '<a href="/{}/{}/" class="exists-{}" title="WARC">🆆 </a> '
+            '<a href="/{}/{}" class="exists-{}" title="SingleFile">🗜 </a>'
             '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
             '<a href="/{}/{}/" class="exists-{}" title="Git repos">📦 </a> '
             '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
@@ -115,6 +116,7 @@ class SnapshotAdmin(admin.ModelAdmin):
             *link_tuple(link, 'screenshot_path'),
             *link_tuple(link, 'dom_path'),
             *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
+            *link_tuple(link, 'singlefile_path'),
             *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
             *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
             canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
archivebox/extractors/__init__.py (header lost in mirror)

@@ -25,6 +25,7 @@ from ..logging_util import (
 from .title import should_save_title, save_title
 from .favicon import should_save_favicon, save_favicon
 from .wget import should_save_wget, save_wget
+from .singlefile import should_save_singlefile, save_singlefile
 from .pdf import should_save_pdf, save_pdf
 from .screenshot import should_save_screenshot, save_screenshot
 from .dom import should_save_dom, save_dom
@@ -37,6 +38,7 @@ def get_default_archive_methods():
         ('title', should_save_title, save_title),
         ('favicon', should_save_favicon, save_favicon),
         ('wget', should_save_wget, save_wget),
+        ('singlefile', should_save_singlefile, save_singlefile),
         ('pdf', should_save_pdf, save_pdf),
         ('screenshot', should_save_screenshot, save_screenshot),
         ('dom', should_save_dom, save_dom),
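Each extractor registers as a (name, should_save, save) triple, and the archive loop consults should_save before calling save; that is why singlefile only needs these two hooks to participate. A simplified sketch of that dispatch (illustrative only; the real loop in archive_link() also handles logging, ignore_methods, and error accounting):

    # Simplified dispatch over extractor triples, in the spirit of
    # archive_link(): a method is skipped unless its should_save() predicate
    # approves, so an existing singlefile.html is never re-fetched.
    def run_archive_methods(link, out_dir, methods):
        results = {}
        for name, should_save, save in methods:
            if should_save(link, out_dir=out_dir):
                results[name] = save(link, out_dir=out_dir)
            else:
                results[name] = 'skipped'
        return results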
archivebox/extractors/singlefile.py: new file, 84 lines

@@ -0,0 +1,84 @@
+__package__ = 'archivebox.extractors'
+
+from pathlib import Path
+
+from typing import Optional
+import json
+
+from ..index.schema import Link, ArchiveResult, ArchiveError
+from ..system import run, chmod_file
+from ..util import (
+    enforce_types,
+    chrome_args
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_SINGLEFILE,
+    SINGLEFILE_BINARY,
+    SINGLEFILE_VERSION,
+    CHROME_BINARY,
+)
+from ..logging_util import TimedProgress
+
+
+@enforce_types
+def should_save_singlefile(link: Link, out_dir: Optional[str]=None) -> bool:
+    out_dir = out_dir or link.link_dir
+
+    output = Path(out_dir or link.link_dir) / 'singlefile.html'
+    return SAVE_SINGLEFILE and (not output.exists())
+
+
+@enforce_types
+def save_singlefile(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """download full site using single-file"""
+
+    out_dir = out_dir or link.link_dir
+    output = str(Path(out_dir).absolute() / "singlefile.html")
+
+    browser_args = chrome_args(TIMEOUT=0)
+
+    # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
+    cmd = [
+        SINGLEFILE_BINARY,
+        '--browser-executable-path={}'.format(CHROME_BINARY),
+        '--browser-args="{}"'.format(json.dumps(browser_args[1:])),
+        link.url,
+        output
+    ]
+
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix=' ')
+    try:
+        result = run(cmd, cwd=out_dir, timeout=timeout)
+
+        # parse out number of files downloaded from last line of stderr:
+        #  "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
+        output_tail = [
+            line.strip()
+            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
+            if line.strip()
+        ]
+        hints = (
+            'Got single-file response code: {}.'.format(result.returncode),
+            *output_tail,
+        )
+
+        # Check for common failure cases
+        if (result.returncode > 0):
+            raise ArchiveError('SingleFile was not able to archive the page', hints)
+        chmod_file(output)
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=SINGLEFILE_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )
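Worth noting how the command line comes together: chrome_args() returns the Chrome binary followed by its flags, so browser_args[1:] drops the binary and the remaining flags are handed to single-file as one JSON-encoded --browser-args value. A quick illustration with assumed sample values (not taken from a real run):

    # Illustrative: what save_singlefile's cmd looks like for a sample URL,
    # assuming chrome_args() returned the binary plus two flags.
    import json

    SINGLEFILE_BINARY = '/extra/SingleFile-master/cli/single-file'  # assumed path
    CHROME_BINARY = 'google-chrome'                                 # assumed
    browser_args = ['google-chrome', '--headless', '--no-sandbox']  # sample output

    cmd = [
        SINGLEFILE_BINARY,
        '--browser-executable-path={}'.format(CHROME_BINARY),
        '--browser-args="{}"'.format(json.dumps(browser_args[1:])),
        'https://example.com',
        '/data/archive/1594000000/singlefile.html',  # hypothetical snapshot dir
    ]
    print(cmd[2])
    # --browser-args="["--headless", "--no-sandbox"]"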
archivebox/index/schema.py (header lost in mirror)

@@ -365,6 +365,7 @@ class Link:
             'screenshot.png',
             'output.html',
             'media',
+            'singlefile.html'
         )
 
         return any(
@@ -376,7 +377,7 @@ class Link:
         """get the latest output that each archive method produced for link"""
 
         ARCHIVE_METHODS = (
-            'title', 'favicon', 'wget', 'warc', 'pdf',
+            'title', 'favicon', 'wget', 'warc', 'singlefile', 'pdf',
             'screenshot', 'dom', 'git', 'media', 'archive_org',
         )
         latest: Dict[str, ArchiveOutput] = {}
@@ -392,7 +393,6 @@ class Link:
                 latest[archive_method] = history[0].output
             else:
                 latest[archive_method] = None
-
         return latest
 
 
@@ -406,6 +406,7 @@ class Link:
             'google_favicon_path': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
             'wget_path': wget_output_path(self),
             'warc_path': 'warc',
+            'singlefile_path': 'singlefile.html',
             'pdf_path': 'output.pdf',
             'screenshot_path': 'screenshot.png',
             'dom_path': 'output.html',
@@ -425,7 +426,7 @@ class Link:
                 'pdf_path': static_path,
                 'screenshot_path': static_path,
                 'dom_path': static_path,
+                'singlefile_path': static_path,
             })
         return canonical
 
-
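With singlefile_path registered in canonical_outputs(), downstream consumers (the admin columns above and the details template below) can resolve a snapshot's SingleFile artifact uniformly instead of hard-coding the filename. A hedged sketch of that lookup (hypothetical helper, simplified Link usage):

    # Hypothetical consumer of canonical_outputs(): resolve and check the
    # SingleFile artifact for one snapshot directory.
    from pathlib import Path
    from typing import Optional

    def singlefile_output(link, out_dir: Path) -> Optional[Path]:
        canon = link.canonical_outputs()  # now includes 'singlefile_path'
        candidate = out_dir / canon['singlefile_path']
        return candidate if candidate.exists() else None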
archivebox/logging_util.py (header lost in mirror)

@@ -518,6 +518,7 @@ def printable_folder_status(name: str, folder: Dict) -> str:
 
 @enforce_types
 def printable_dependency_version(name: str, dependency: Dict) -> str:
+    version = None
     if dependency['enabled']:
         if dependency['is_valid']:
             color, symbol, note, version = 'green', '√', 'valid', ''
link_details.html template (header lost in mirror)

@@ -79,6 +79,7 @@
     .card {
         overflow: hidden;
         box-shadow: 2px 3px 14px 0px rgba(0,0,0,0.02);
+        margin-top: 10px;
     }
     .card h4 {
         font-size: 1.4vw;
@@ -335,6 +336,18 @@
                 </div>
             </div>
         </div>
+        <div class="col-lg-2">
+            <div class="card">
+                <iframe class="card-img-top" src="$singlefile_path" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                <div class="card-body">
+                    <a href="$singlefile_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
+                        <img src="../../static/external.png" class="external"/>
+                    </a>
+                    <a href="$singlefile_path" target="preview"><h4 class="card-title">SingleFile</h4></a>
+                    <p class="card-text">archive/singlefile.html</p>
+                </div>
+            </div>
+        </div>
         <div class="col-lg-2">
             <div class="card">
                 <iframe class="card-img-top pdf-frame" src="$pdf_path" scrolling="no"></iframe>
tests/fixtures.py (header lost in mirror)

@@ -8,3 +8,18 @@ def process(tmp_path):
     os.chdir(tmp_path)
     process = subprocess.run(['archivebox', 'init'], capture_output=True)
     return process
+
+@pytest.fixture
+def disable_extractors_dict():
+    env = os.environ.copy()
+    env.update({
+        "USE_WGET": "false",
+        "USE_SINGLEFILE": "false",
+        "SAVE_PDF": "false",
+        "SAVE_SCREENSHOT": "false",
+        "SAVE_DOM": "false",
+        "USE_GIT": "false",
+        "SAVE_MEDIA": "false",
+        "SAVE_ARCHIVE_DOT_ORG": "false"
+    })
+    return env
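One detail worth keeping: the fixture starts from os.environ.copy() rather than a bare dict of overrides, because env= replaces the child process's entire environment. An illustrative sketch of why (not part of the PR):

    # Why the fixture copies os.environ: passing env= replaces the child's
    # whole environment, so overrides must be layered onto a full copy or the
    # child loses PATH, HOME, etc. and can't even find the archivebox binary.
    import os
    import subprocess

    env = os.environ.copy()
    env.update({"USE_SINGLEFILE": "false"})             # targeted override
    subprocess.run(["archivebox", "version"], env=env)  # still resolves via PATH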
tests/test_args.py (header lost in mirror; inferred from contents)

@@ -3,25 +3,30 @@ import json
 
 from .fixtures import *
 
-def test_depth_flag_is_accepted(process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], capture_output=True)
+def test_depth_flag_is_accepted(process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"],
+                                 capture_output=True, env=disable_extractors_dict)
     assert 'unrecognized arguments: --depth' not in arg_process.stderr.decode("utf-8")
 
-def test_depth_flag_fails_if_it_is_not_0_or_1(process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=5"], capture_output=True)
+def test_depth_flag_fails_if_it_is_not_0_or_1(process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=5"],
+                                 capture_output=True, env=disable_extractors_dict)
     assert 'invalid choice' in arg_process.stderr.decode("utf-8")
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=-1"], capture_output=True)
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=-1"],
+                                 capture_output=True, env=disable_extractors_dict)
     assert 'invalid choice' in arg_process.stderr.decode("utf-8")
 
-def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], capture_output=True)
+def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"],
+                                 capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
     with open(archived_item_path / "index.json", "r") as f:
         output_json = json.load(f)
     assert output_json["base_url"] == "127.0.0.1:8080/static/example.com.html"
 
-def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=1"], capture_output=True)
+def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=1"],
+                                 capture_output=True, env=disable_extractors_dict)
     with open(tmp_path / "index.json", "r") as f:
         archive_file = f.read()
     assert "http://127.0.0.1:8080/static/example.com.html" in archive_file
tests/test_extractors.py (header lost in mirror; inferred from contents)

@@ -1,8 +1,10 @@
 from .fixtures import *
 from archivebox.extractors import ignore_methods, get_default_archive_methods, should_save_title
 
-def test_wget_broken_pipe(tmp_path, process):
-    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+def test_wget_broken_pipe(tmp_path, process, disable_extractors_dict):
+    disable_extractors_dict.update({"USE_WGET": "true"})
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
     assert "TypeError chmod_file(..., path: str) got unexpected NoneType argument path=None" not in add_process.stdout.decode("utf-8")
 
 def test_ignore_methods():
@@ -11,3 +13,11 @@ def test_ignore_methods():
     """
     ignored = ignore_methods(['title'])
     assert should_save_title not in ignored
+
+def test_singlefile_works(tmp_path, process, disable_extractors_dict):
+    disable_extractors_dict.update({"USE_SINGLEFILE": "true"})
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+    output_file = archived_item_path / "singlefile.html"
+    assert output_file.exists()
tests/test_init.py (header lost in mirror; inferred from test names)

@@ -18,9 +18,10 @@ def test_update(tmp_path, process):
     update_process = subprocess.run(['archivebox', 'init'], capture_output=True)
     assert "Updating existing ArchiveBox collection in this folder" in update_process.stdout.decode("utf-8")
 
-def test_add_link(tmp_path, process):
+def test_add_link(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
-    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
 
     assert "index.json" in [x.name for x in archived_item_path.iterdir()]
@@ -33,9 +34,10 @@ def test_add_link(tmp_path, process):
         output_html = f.read()
     assert "Example Domain" in output_html
 
-def test_add_link_support_stdin(tmp_path, process):
+def test_add_link_support_stdin(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
-    stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+                                     env=disable_extractors_dict)
     stdin_process.communicate(input="http://127.0.0.1:8080/static/example.com.html".encode())
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
 
@@ -51,9 +53,10 @@ def test_correct_permissions_output_folder(tmp_path, process):
         file_path = tmp_path / file
         assert oct(file_path.stat().st_mode)[-3:] == OUTPUT_PERMISSIONS
 
-def test_correct_permissions_add_command_results(tmp_path, process):
+def test_correct_permissions_add_command_results(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
-    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True,
+                                 env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
     for path in archived_item_path.iterdir():
         assert oct(path.stat().st_mode)[-3:] == OUTPUT_PERMISSIONS
tests/test_oneshot.py (header lost in mirror; inferred from contents)

@@ -2,13 +2,14 @@ from pathlib import Path
 
 from .fixtures import *
 
-def test_oneshot_command_exists(tmp_path):
+def test_oneshot_command_exists(tmp_path, disable_extractors_dict):
     os.chdir(tmp_path)
-    process = subprocess.run(['archivebox', 'oneshot'], capture_output=True)
+    process = subprocess.run(['archivebox', 'oneshot'], capture_output=True, env=disable_extractors_dict)
     assert not "invalid choice: 'oneshot'" in process.stderr.decode("utf-8")
 
-def test_oneshot_commad_saves_page_in_right_folder(tmp_path):
-    process = subprocess.run(["archivebox", "oneshot", f"--out-dir={tmp_path}", "http://127.0.0.1:8080/static/example.com.html"], capture_output=True)
+def test_oneshot_commad_saves_page_in_right_folder(tmp_path, disable_extractors_dict):
+    process = subprocess.run(["archivebox", "oneshot", f"--out-dir={tmp_path}", "http://127.0.0.1:8080/static/example.com.html"],
+                             capture_output=True, env=disable_extractors_dict)
     items = ' '.join([str(x) for x in tmp_path.iterdir()])
     current_path = ' '.join([str(x) for x in Path.cwd().iterdir()])
     assert "index.json" in items
tests/test_remove.py (header lost in mirror; inferred from contents)

@@ -1,8 +1,8 @@
 from .fixtures import *
 
-def test_remove_leaves_index_in_consistent_state(tmp_path, process):
+def test_remove_leaves_index_in_consistent_state(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
-    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
     remove_process = subprocess.run(['archivebox', 'remove', '127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True)
     list_process = subprocess.run(['archivebox', 'list'], capture_output=True)
     assert "Warning: SQL index does not match JSON index!" not in list_process.stderr.decode("utf-8")
tests/test_title.py (header lost in mirror; inferred from contents)

@@ -1,12 +1,13 @@
 from .fixtures import *
 
-def test_title_is_htmlencoded_in_index_html(tmp_path, process):
+def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractors_dict):
     """
     https://github.com/pirate/ArchiveBox/issues/330
     Unencoded content should not be rendered as it facilitates xss injections
     and breaks the layout.
     """
-    add_process = subprocess.run(['archivebox', 'add', 'http://localhost:8080/static/title_with_html.com.html'], capture_output=True)
+    add_process = subprocess.run(['archivebox', 'add', 'http://localhost:8080/static/title_with_html.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
 
     with open(tmp_path / "index.html", "r") as f:
         output_html = f.read()