diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 492cf334..65ba19a6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -40,6 +40,22 @@ jobs: with: fetch-depth: 1 + - uses: actions/checkout@v2 + with: + fetch-depth: 1 + repository: "gildas-lormeau/SingleFile" + ref: "master" + path: "singlefile" + + - name: Install npm requirements for singlefile + run: npm install --prefix singlefile/cli + + - name: Give singlefile execution permissions + run: chmod +x singlefile/cli/single-file + + - name: Set SINGLEFILE_BINARY + run: echo "SINGLEFILE_BINARY=$GITHUB_WORKSPACE/singlefile/cli/single-file" >> "$GITHUB_ENV" + - name: Set up Python ${{ matrix.python }} uses: actions/setup-python@v1 with: @@ -60,6 +76,14 @@ jobs: restore-keys: | ${{ runner.os }}-${{ matrix.python }}-venv- + - name: Use nodejs 14.7.0 + uses: actions/setup-node@v1 + with: + node-version: 14.7.0 + + - name: Debug + run: ls ./ + - name: Install dependencies run: | python -m pip install . 
diff --git a/Dockerfile b/Dockerfile index 97bd1bd1..460175d9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,8 +10,8 @@ FROM python:3.8-slim-buster LABEL name="archivebox" \ - maintainer="Nick Sweeting " \ - description="All-in-one personal internet archiving container" + maintainer="Nick Sweeting " \ + description="All-in-one personal internet archiving container" ENV TZ=UTC \ LANGUAGE=en_US:en \ @@ -22,28 +22,40 @@ ENV TZ=UTC \ APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \ CODE_PATH=/app \ VENV_PATH=/venv \ - DATA_PATH=/data + DATA_PATH=/data \ + EXTRA_PATH=/extra -# First install CLI utils and base deps, then Chrome + Fons +# First install CLI utils and base deps, then Chrome + Fonts + nodejs RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections \ && apt-get update -qq \ && apt-get install -qq -y --no-install-recommends \ - apt-transport-https ca-certificates apt-utils gnupg gosu gnupg2 libgconf-2-4 zlib1g-dev \ - dumb-init jq git wget curl youtube-dl ffmpeg \ + apt-transport-https ca-certificates apt-utils gnupg gosu gnupg2 libgconf-2-4 zlib1g-dev \ + dumb-init jq git wget curl youtube-dl ffmpeg \ && curl -sSL "https://dl.google.com/linux/linux_signing_key.pub" | apt-key add - \ && echo "deb https://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \ + && curl -sL https://deb.nodesource.com/setup_14.x | bash - \ && apt-get update -qq \ && apt-get install -qq -y --no-install-recommends \ - google-chrome-stable \ - fontconfig \ - fonts-ipafont-gothic \ - fonts-wqy-zenhei \ - fonts-thai-tlwg \ - fonts-kacst \ - fonts-symbola \ - fonts-noto \ - fonts-freefont-ttf \ - && rm -rf /var/lib/apt/lists/* + google-chrome-stable \ + fontconfig \ + fonts-ipafont-gothic \ + fonts-wqy-zenhei \ + fonts-thai-tlwg \ + fonts-kacst \ + fonts-symbola \ + fonts-noto \ + fonts-freefont-ttf \ + nodejs \ + unzip \ + && rm -rf /var/lib/apt/lists/* + +# Download SingleFile into $EXTRA_PATH and install its CLI dependencies; archivebox finds it via SINGLEFILE_BINARY 
+ +WORKDIR "$EXTRA_PATH" +RUN wget -qO - https://github.com/gildas-lormeau/SingleFile/archive/master.zip > SingleFile.zip \ + && unzip -q SingleFile.zip \ + && npm install --prefix SingleFile-master/cli --production > /dev/null 2>&1 \ + && chmod +x SingleFile-master/cli/single-file # Run everything from here on out as non-privileged user RUN groupadd --system archivebox \ @@ -60,7 +72,8 @@ VOLUME "$DATA_PATH" WORKDIR "$DATA_PATH" EXPOSE 8000 ENV CHROME_BINARY=google-chrome \ - CHROME_SANDBOX=False + CHROME_SANDBOX=False \ + SINGLEFILE_BINARY="$EXTRA_PATH/SingleFile-master/cli/single-file" RUN env ALLOW_ROOT=True archivebox version diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index 90727e8c..826f9824 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -74,6 +74,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)}, 'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)}, 'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)}, + 'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)}, 'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)}, 'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)}, 'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)}, @@ -104,6 +105,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'DEPENDENCY_CONFIG': { 'USE_CURL': {'type': bool, 'default': True}, 'USE_WGET': {'type': bool, 'default': True}, + 'USE_SINGLEFILE': {'type': bool, 'default': True}, 'USE_GIT': {'type': bool, 'default': True}, 'USE_CHROME': {'type': bool, 'default': True}, 'USE_YOUTUBEDL': {'type': bool, 'default': True}, @@ -111,6 +113,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'CURL_BINARY': {'type': str, 'default': 'curl'}, 'GIT_BINARY': {'type': str, 'default': 'git'}, 
'WGET_BINARY': {'type': str, 'default': 'wget'}, + 'SINGLEFILE_BINARY': {'type': str, 'default': 'single-file'}, 'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'}, 'CHROME_BINARY': {'type': str, 'default': None}, }, @@ -249,6 +252,10 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']}, 'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']}, + 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and (c['SAVE_SINGLEFILE'])}, + 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None}, + 'SAVE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']}, + 'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None}, 'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, @@ -674,6 +681,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue: 'enabled': config['USE_WGET'], 'is_valid': bool(config['WGET_VERSION']), }, + 'SINGLEFILE_BINARY': { + 'path': bin_path(config['SINGLEFILE_BINARY']), + 'version': config['SINGLEFILE_VERSION'], + 'hash': bin_hash(config['SINGLEFILE_BINARY']), + 'enabled': config['USE_SINGLEFILE'], + 'is_valid': bool(config['SINGLEFILE_VERSION']), + }, 'GIT_BINARY': { 'path': bin_path(config['GIT_BINARY']), 'version': config['GIT_VERSION'], diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 4578cc11..941cedab 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -106,6 +106,7 @@ class SnapshotAdmin(admin.ModelAdmin): '🖥 ' '🅷 ' '🆆 ' + '🗜 ' '📼 ' '📦 ' '🏛 ' @@ -115,6 +116,7 @@ class SnapshotAdmin(admin.ModelAdmin): *link_tuple(link, 'screenshot_path'), *link_tuple(link, 'dom_path'), *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')), + *link_tuple(link, 'singlefile_path'), *link_tuple(link, 
'media_path')[:2], any((out_dir / canon['media_path']).glob('*')), *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')), canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(), diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 284ce569..bdeae3d7 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -25,6 +25,7 @@ from ..logging_util import ( from .title import should_save_title, save_title from .favicon import should_save_favicon, save_favicon from .wget import should_save_wget, save_wget +from .singlefile import should_save_singlefile, save_singlefile from .pdf import should_save_pdf, save_pdf from .screenshot import should_save_screenshot, save_screenshot from .dom import should_save_dom, save_dom @@ -37,6 +38,7 @@ def get_default_archive_methods(): ('title', should_save_title, save_title), ('favicon', should_save_favicon, save_favicon), ('wget', should_save_wget, save_wget), + ('singlefile', should_save_singlefile, save_singlefile), ('pdf', should_save_pdf, save_pdf), ('screenshot', should_save_screenshot, save_screenshot), ('dom', should_save_dom, save_dom), diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py new file mode 100644 index 00000000..3a1a3759 --- /dev/null +++ b/archivebox/extractors/singlefile.py @@ -0,0 +1,84 @@ +__package__ = 'archivebox.extractors' + +from pathlib import Path + +from typing import Optional +import json + +from ..index.schema import Link, ArchiveResult, ArchiveError +from ..system import run, chmod_file +from ..util import ( + enforce_types, + chrome_args +) +from ..config import ( + TIMEOUT, + SAVE_SINGLEFILE, + SINGLEFILE_BINARY, + SINGLEFILE_VERSION, + CHROME_BINARY, +) +from ..logging_util import TimedProgress + + +@enforce_types +def should_save_singlefile(link: Link, out_dir: Optional[str]=None) -> bool: + out_dir = out_dir or link.link_dir + + output = Path(out_dir or 
link.link_dir) / 'singlefile.html' + return SAVE_SINGLEFILE and (not output.exists()) + + +@enforce_types +def save_singlefile(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: + """save link.url as one self-contained singlefile.html inside out_dir using the single-file CLI""" + + out_dir = out_dir or link.link_dir + output = str(Path(out_dir).absolute() / "singlefile.html") + + browser_args = chrome_args(TIMEOUT=0) + + # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli + cmd = [ + SINGLEFILE_BINARY, + '--browser-executable-path={}'.format(CHROME_BINARY), + '--browser-args={}'.format(json.dumps(browser_args[1:])), + link.url, + output + ] + + status = 'succeeded' + timer = TimedProgress(timeout, prefix=' ') + try: + result = run(cmd, cwd=out_dir, timeout=timeout) + + # keep the last few non-empty lines of stdout + stderr so they can be + # shown as hints next to the exit code if single-file fails + output_tail = [ + line.strip() + for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] + if line.strip() + ] + hints = ( + 'Got single-file response code: {}.'.format(result.returncode), + *output_tail, + ) + + # Check for common failure cases + if (result.returncode > 0): + raise ArchiveError('SingleFile was not able to archive the page', hints) + chmod_file(output) + except Exception as err: + status = 'failed' + output = err + finally: + timer.end() + + return ArchiveResult( + cmd=cmd, + pwd=out_dir, + cmd_version=SINGLEFILE_VERSION, + output=output, + status=status, + **timer.stats, + ) diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 8285e412..2129f5d3 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -365,6 +365,7 @@ class Link: 'screenshot.png', 'output.html', 'media', + 'singlefile.html' ) return any( @@ -376,7 +377,7 @@ class Link: """get the latest output that each archive method produced for link""" ARCHIVE_METHODS = ( - 'title', 'favicon', 'wget', 'warc', 'pdf', + 'title', 'favicon', 'wget', 
'warc', 'singlefile', 'pdf', 'screenshot', 'dom', 'git', 'media', 'archive_org', ) latest: Dict[str, ArchiveOutput] = {} @@ -392,7 +393,6 @@ class Link: latest[archive_method] = history[0].output else: latest[archive_method] = None - return latest @@ -406,6 +406,7 @@ class Link: 'google_favicon_path': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain), 'wget_path': wget_output_path(self), 'warc_path': 'warc', + 'singlefile_path': 'singlefile.html', 'pdf_path': 'output.pdf', 'screenshot_path': 'screenshot.png', 'dom_path': 'output.html', @@ -425,7 +426,7 @@ class Link: 'pdf_path': static_path, 'screenshot_path': static_path, 'dom_path': static_path, + 'singlefile_path': static_path, }) return canonical - diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index c44f87f1..684f3d80 100644 --- a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -518,6 +518,7 @@ def printable_folder_status(name: str, folder: Dict) -> str: @enforce_types def printable_dependency_version(name: str, dependency: Dict) -> str: + version = None if dependency['enabled']: if dependency['is_valid']: color, symbol, note, version = 'green', '√', 'valid', '' diff --git a/archivebox/themes/legacy/link_details.html b/archivebox/themes/legacy/link_details.html index c5173470..447552ad 100644 --- a/archivebox/themes/legacy/link_details.html +++ b/archivebox/themes/legacy/link_details.html @@ -79,6 +79,7 @@ .card { overflow: hidden; box-shadow: 2px 3px 14px 0px rgba(0,0,0,0.02); + margin-top: 10px; } .card h4 { font-size: 1.4vw; @@ -335,6 +336,18 @@ +
+
+ +
+ + + +

SingleFile

+

archive/singlefile.html

+
+
+
diff --git a/tests/fixtures.py b/tests/fixtures.py index 9bf2640a..3d8dabfe 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -7,4 +7,19 @@ import pytest def process(tmp_path): os.chdir(tmp_path) process = subprocess.run(['archivebox', 'init'], capture_output=True) - return process \ No newline at end of file + return process + +@pytest.fixture +def disable_extractors_dict(): + env = os.environ.copy() + env.update({ + "USE_WGET": "false", + "USE_SINGLEFILE": "false", + "SAVE_PDF": "false", + "SAVE_SCREENSHOT": "false", + "SAVE_DOM": "false", + "USE_GIT": "false", + "SAVE_MEDIA": "false", + "SAVE_ARCHIVE_DOT_ORG": "false" + }) + return env \ No newline at end of file diff --git a/tests/test_args.py b/tests/test_args.py index ed132524..c616cb80 100644 --- a/tests/test_args.py +++ b/tests/test_args.py @@ -3,25 +3,30 @@ import json from .fixtures import * -def test_depth_flag_is_accepted(process): - arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], capture_output=True) +def test_depth_flag_is_accepted(process, disable_extractors_dict): + arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], + capture_output=True, env=disable_extractors_dict) assert 'unrecognized arguments: --depth' not in arg_process.stderr.decode("utf-8") -def test_depth_flag_fails_if_it_is_not_0_or_1(process): - arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=5"], capture_output=True) +def test_depth_flag_fails_if_it_is_not_0_or_1(process, disable_extractors_dict): + arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=5"], + capture_output=True, env=disable_extractors_dict) assert 'invalid choice' in arg_process.stderr.decode("utf-8") - arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=-1"], 
capture_output=True) + arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=-1"], + capture_output=True, env=disable_extractors_dict) assert 'invalid choice' in arg_process.stderr.decode("utf-8") -def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process): - arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], capture_output=True) +def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process, disable_extractors_dict): + arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], + capture_output=True, env=disable_extractors_dict) archived_item_path = list(tmp_path.glob('archive/**/*'))[0] with open(archived_item_path / "index.json", "r") as f: output_json = json.load(f) assert output_json["base_url"] == "127.0.0.1:8080/static/example.com.html" -def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process): - arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=1"], capture_output=True) +def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process, disable_extractors_dict): + arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=1"], + capture_output=True, env=disable_extractors_dict) with open(tmp_path / "index.json", "r") as f: archive_file = f.read() assert "http://127.0.0.1:8080/static/example.com.html" in archive_file diff --git a/tests/test_extractors.py b/tests/test_extractors.py index c7aaaeaf..ffb933c1 100644 --- a/tests/test_extractors.py +++ b/tests/test_extractors.py @@ -1,8 +1,10 @@ from .fixtures import * from archivebox.extractors import ignore_methods, get_default_archive_methods, should_save_title -def test_wget_broken_pipe(tmp_path, process): - add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], 
capture_output=True) +def test_wget_broken_pipe(tmp_path, process, disable_extractors_dict): + disable_extractors_dict.update({"USE_WGET": "true"}) + add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], + capture_output=True, env=disable_extractors_dict) assert "TypeError chmod_file(..., path: str) got unexpected NoneType argument path=None" not in add_process.stdout.decode("utf-8") def test_ignore_methods(): @@ -10,4 +12,12 @@ def test_ignore_methods(): Takes the passed method out of the default methods list and returns that value """ ignored = ignore_methods(['title']) - assert should_save_title not in ignored \ No newline at end of file + assert should_save_title not in ignored + +def test_singlefile_works(tmp_path, process, disable_extractors_dict): + disable_extractors_dict.update({"USE_SINGLEFILE": "true"}) + add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], + capture_output=True, env=disable_extractors_dict) + archived_item_path = list(tmp_path.glob('archive/**/*'))[0] + output_file = archived_item_path / "singlefile.html" + assert output_file.exists() diff --git a/tests/test_init.py b/tests/test_init.py index 133aaaa9..bd1ad516 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -18,9 +18,10 @@ def test_update(tmp_path, process): update_process = subprocess.run(['archivebox', 'init'], capture_output=True) assert "Updating existing ArchiveBox collection in this folder" in update_process.stdout.decode("utf-8") -def test_add_link(tmp_path, process): +def test_add_link(tmp_path, process, disable_extractors_dict): os.chdir(tmp_path) - add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True) + add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], + capture_output=True, env=disable_extractors_dict) archived_item_path = 
list(tmp_path.glob('archive/**/*'))[0] assert "index.json" in [x.name for x in archived_item_path.iterdir()] @@ -33,9 +34,10 @@ def test_add_link(tmp_path, process): output_html = f.read() assert "Example Domain" in output_html -def test_add_link_support_stdin(tmp_path, process): +def test_add_link_support_stdin(tmp_path, process, disable_extractors_dict): os.chdir(tmp_path) - stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + env=disable_extractors_dict) stdin_process.communicate(input="http://127.0.0.1:8080/static/example.com.html".encode()) archived_item_path = list(tmp_path.glob('archive/**/*'))[0] @@ -51,9 +53,10 @@ def test_correct_permissions_output_folder(tmp_path, process): file_path = tmp_path / file assert oct(file_path.stat().st_mode)[-3:] == OUTPUT_PERMISSIONS -def test_correct_permissions_add_command_results(tmp_path, process): +def test_correct_permissions_add_command_results(tmp_path, process, disable_extractors_dict): os.chdir(tmp_path) - add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True) + add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, + env=disable_extractors_dict) archived_item_path = list(tmp_path.glob('archive/**/*'))[0] for path in archived_item_path.iterdir(): assert oct(path.stat().st_mode)[-3:] == OUTPUT_PERMISSIONS diff --git a/tests/test_oneshot.py b/tests/test_oneshot.py index 7ff9867f..66a567e1 100644 --- a/tests/test_oneshot.py +++ b/tests/test_oneshot.py @@ -2,13 +2,14 @@ from pathlib import Path from .fixtures import * -def test_oneshot_command_exists(tmp_path): +def test_oneshot_command_exists(tmp_path, disable_extractors_dict): os.chdir(tmp_path) - process = 
subprocess.run(['archivebox', 'oneshot'], capture_output=True) + process = subprocess.run(['archivebox', 'oneshot'], capture_output=True, env=disable_extractors_dict) assert not "invalid choice: 'oneshot'" in process.stderr.decode("utf-8") -def test_oneshot_commad_saves_page_in_right_folder(tmp_path): - process = subprocess.run(["archivebox", "oneshot", f"--out-dir={tmp_path}", "http://127.0.0.1:8080/static/example.com.html"], capture_output=True) +def test_oneshot_commad_saves_page_in_right_folder(tmp_path, disable_extractors_dict): + process = subprocess.run(["archivebox", "oneshot", f"--out-dir={tmp_path}", "http://127.0.0.1:8080/static/example.com.html"], + capture_output=True, env=disable_extractors_dict) items = ' '.join([str(x) for x in tmp_path.iterdir()]) current_path = ' '.join([str(x) for x in Path.cwd().iterdir()]) assert "index.json" in items diff --git a/tests/test_remove.py b/tests/test_remove.py index 040dafdc..d26c96bb 100644 --- a/tests/test_remove.py +++ b/tests/test_remove.py @@ -1,8 +1,8 @@ from .fixtures import * -def test_remove_leaves_index_in_consistent_state(tmp_path, process): +def test_remove_leaves_index_in_consistent_state(tmp_path, process, disable_extractors_dict): os.chdir(tmp_path) - subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True) + subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict) remove_process = subprocess.run(['archivebox', 'remove', '127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True) list_process = subprocess.run(['archivebox', 'list'], capture_output=True) assert "Warning: SQL index does not match JSON index!" 
not in list_process.stderr.decode("utf-8") \ No newline at end of file diff --git a/tests/test_title.py b/tests/test_title.py index b5090844..24b2cc28 100644 --- a/tests/test_title.py +++ b/tests/test_title.py @@ -1,12 +1,13 @@ from .fixtures import * -def test_title_is_htmlencoded_in_index_html(tmp_path, process): +def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractors_dict): """ https://github.com/pirate/ArchiveBox/issues/330 Unencoded content should not be rendered as it facilitates xss injections and breaks the layout. """ - add_process = subprocess.run(['archivebox', 'add', 'http://localhost:8080/static/title_with_html.com.html'], capture_output=True) + add_process = subprocess.run(['archivebox', 'add', 'http://localhost:8080/static/title_with_html.com.html'], + capture_output=True, env=disable_extractors_dict) with open(tmp_path / "index.html", "r") as f: output_html = f.read()