From 853685668cae5d3257923838ff462aaf0e75a7aa Mon Sep 17 00:00:00 2001 From: Cristian Date: Thu, 30 Jul 2020 13:23:10 -0500 Subject: [PATCH 01/15] feat: Add initial support for singlefile extractor --- archivebox/config/__init__.py | 13 +++++ archivebox/extractors/__init__.py | 2 + archivebox/extractors/singlefile.py | 81 +++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+) create mode 100644 archivebox/extractors/singlefile.py diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index 90727e8c..5a747187 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -74,6 +74,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)}, 'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)}, 'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)}, + 'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)}, 'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)}, 'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)}, 'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)}, @@ -104,6 +105,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'DEPENDENCY_CONFIG': { 'USE_CURL': {'type': bool, 'default': True}, 'USE_WGET': {'type': bool, 'default': True}, + 'USE_SINGLEFILE': {'type': bool, 'default': True}, 'USE_GIT': {'type': bool, 'default': True}, 'USE_CHROME': {'type': bool, 'default': True}, 'USE_YOUTUBEDL': {'type': bool, 'default': True}, @@ -111,6 +113,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'CURL_BINARY': {'type': str, 'default': 'curl'}, 'GIT_BINARY': {'type': str, 'default': 'git'}, 'WGET_BINARY': {'type': str, 'default': 'wget'}, + 'SINGLEFILE_BINARY': {'type': str, 'default': 'single-file'}, 'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'}, 'CHROME_BINARY': {'type': str, 'default': None}, }, @@ -249,6 +252,9 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']}, 'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']}, + 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and (c['SAVE_SINGLEFILE'])}, + 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None}, + 'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None}, 'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, @@ -674,6 +680,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue: 'enabled': config['USE_WGET'], 'is_valid': bool(config['WGET_VERSION']), }, + 'SINGLEFILE_BINARY': { + 'path': bin_path(config['SINGLEFILE_BINARY']), + 'version': config['SINGLEFILE_VERSION'], + 'hash': bin_hash(config['SINGLEFILE_BINARY']), + 'enabled': config['USE_SINGLEFILE'], + 'is_valid': bool(config['SINGLEFILE_VERSION']), + }, 'GIT_BINARY': { 'path': bin_path(config['GIT_BINARY']), 'version': config['GIT_VERSION'], diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 284ce569..bdeae3d7 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -25,6 +25,7 @@ from ..logging_util import ( from .title import should_save_title, save_title from .favicon import should_save_favicon, save_favicon from .wget import should_save_wget, save_wget +from .singlefile import should_save_singlefile, save_singlefile from .pdf import should_save_pdf, save_pdf from .screenshot import should_save_screenshot, save_screenshot from .dom import should_save_dom, save_dom @@ -37,6 +38,7 @@ def get_default_archive_methods(): ('title', should_save_title, save_title), ('favicon', should_save_favicon, save_favicon), ('wget', should_save_wget, save_wget), + ('singlefile', should_save_singlefile, save_singlefile), ('pdf', should_save_pdf, save_pdf), ('screenshot', should_save_screenshot, save_screenshot), ('dom', should_save_dom, save_dom), diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py new file mode 100644 index 00000000..0c9718e4 --- /dev/null +++ b/archivebox/extractors/singlefile.py @@ -0,0 +1,81 @@ +__package__ = 'archivebox.extractors' + +import os +from pathlib import Path + +from typing import Optional + +from ..index.schema import Link, ArchiveResult, ArchiveError +from ..system import run, chmod_file +from ..util import ( + enforce_types, +) +from ..config import ( + TIMEOUT, + SAVE_SINGLEFILE, + SINGLEFILE_BINARY, + SINGLEFILE_VERSION, + CHROME_BINARY, +) +from ..logging_util import TimedProgress + + +@enforce_types +def should_save_singlefile(link: Link, out_dir: Optional[str]=None) -> bool: + out_dir = out_dir or link.link_dir + if not os.path.exists(out_dir): + return False + + return SAVE_SINGLEFILE + + +@enforce_types +def save_singlefile(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: + """download full site using single-file""" + + out_dir = out_dir or link.link_dir + output = str(Path(out_dir).absolute() / "single-file.html") + + # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html + cmd = [ + SINGLEFILE_BINARY, + '--browser-executable-path={}'.format(CHROME_BINARY), + link.url, + output + ] + + status = 'succeeded' + timer = TimedProgress(timeout, prefix=' ') + try: + result = run(cmd, cwd=out_dir, timeout=timeout) + + # parse out number of files downloaded from last line of stderr: + # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)" + output_tail = [ + line.strip() + for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] + if line.strip() + ] + hints = ( + 'Got single-file response code: {}.'.format(result.returncode), + *output_tail, + ) + + # Check for common failure cases + if (result.returncode > 0): + raise ArchiveError('SingleFile was not able to archive the page', hints) + chmod_file(output) + except Exception as err: + status = 'failed' + output = err + finally: + timer.end() + + return ArchiveResult( + cmd=cmd, + pwd=out_dir, + cmd_version=SINGLEFILE_VERSION, + output=output, + status=status, + **timer.stats, + ) From 787a5ad43eb909da83cb189f4153e7e71fc9175b Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 31 Jul 2020 13:07:22 -0500 Subject: [PATCH 02/15] fix: Commit code review suggestions --- archivebox/extractors/singlefile.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index 0c9718e4..4fa3cf2e 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -23,10 +23,9 @@ from ..logging_util import TimedProgress @enforce_types def should_save_singlefile(link: Link, out_dir: Optional[str]=None) -> bool: out_dir = out_dir or link.link_dir - if not os.path.exists(out_dir): - return False - return SAVE_SINGLEFILE + output = Path(out_dir or link.link_dir) / 'single-file.html' + return SAVE_SINGLEFILE and (not output.exists()) @enforce_types @@ -36,7 +35,7 @@ def save_singlefile(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOU out_dir = out_dir or link.link_dir output = str(Path(out_dir).absolute() / "single-file.html") - # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html + # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli cmd = [ SINGLEFILE_BINARY, '--browser-executable-path={}'.format(CHROME_BINARY), From 3d22da39fef4d4205502ff0224db4e9da7f62981 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 30 Jul 2020 17:26:24 -0400 Subject: [PATCH 03/15] Update archivebox/config/__init__.py --- archivebox/config/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index 5a747187..e1e3117f 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -113,7 +113,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'CURL_BINARY': {'type': str, 'default': 'curl'}, 'GIT_BINARY': {'type': str, 'default': 'git'}, 'WGET_BINARY': {'type': str, 'default': 'wget'}, - 'SINGLEFILE_BINARY': {'type': str, 'default': 'single-file'}, + 'SINGLEFILE_BINARY': {'type': str, 'default': 'single-file'}, 'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'}, 'CHROME_BINARY': {'type': str, 'default': None}, }, From 42b0c804659006a227a7215f97826e326687c399 Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 31 Jul 2020 13:51:32 -0500 Subject: [PATCH 04/15] feat: Add singlefile to link_details --- archivebox/extractors/singlefile.py | 1 - archivebox/index/schema.py | 1 + archivebox/themes/legacy/link_details.html | 13 +++++++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index 4fa3cf2e..196765d8 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -1,6 +1,5 @@ __package__ = 'archivebox.extractors' -import os from pathlib import Path from typing import Optional diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 8285e412..cf162f6b 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -406,6 +406,7 @@ class Link: 'google_favicon_path': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain), 'wget_path': wget_output_path(self), 'warc_path': 'warc', + 'singlefile_path': 'single-file.html', 'pdf_path': 'output.pdf', 'screenshot_path': 'screenshot.png', 'dom_path': 'output.html', diff --git a/archivebox/themes/legacy/link_details.html b/archivebox/themes/legacy/link_details.html index c5173470..405d933c 100644 --- a/archivebox/themes/legacy/link_details.html +++ b/archivebox/themes/legacy/link_details.html @@ -79,6 +79,7 @@ .card { overflow: hidden; box-shadow: 2px 3px 14px 0px rgba(0,0,0,0.02); + margin-top: 10px; } .card h4 { font-size: 1.4vw; @@ -335,6 +336,18 @@ +
+
+ +
+ + + +

SingleFile

+

archive/single-file.html

+
+
+
From a40e3372806e97bb8dce3975dd3eccfc32685a3c Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 31 Jul 2020 14:11:19 -0500 Subject: [PATCH 05/15] feat: Add link to admin list of files --- archivebox/core/admin.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 4578cc11..941cedab 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -106,6 +106,7 @@ class SnapshotAdmin(admin.ModelAdmin): '🖥 ' '🅷 ' '🆆 ' + '🗜 ' '📼 ' '📦 ' '🏛 ' @@ -115,6 +116,7 @@ class SnapshotAdmin(admin.ModelAdmin): *link_tuple(link, 'screenshot_path'), *link_tuple(link, 'dom_path'), *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')), + *link_tuple(link, 'singlefile_path'), *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')), *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')), canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(), From b325c0dd9ffbae21a0542321975224412e769bed Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 31 Jul 2020 14:35:42 -0500 Subject: [PATCH 06/15] feat: Add singlefile to latest outputs --- archivebox/index/schema.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index cf162f6b..0824dbde 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -365,6 +365,7 @@ class Link: 'screenshot.png', 'output.html', 'media', + 'single-file.html' ) return any( @@ -376,7 +377,7 @@ class Link: """get the latest output that each archive method produced for link""" ARCHIVE_METHODS = ( - 'title', 'favicon', 'wget', 'warc', 'pdf', + 'title', 'favicon', 'wget', 'warc', 'singlefile', 'pdf', 'screenshot', 'dom', 'git', 'media', 'archive_org', ) latest: Dict[str, ArchiveOutput] = {} @@ -392,7 +393,6 @@ class Link: latest[archive_method] = history[0].output else: latest[archive_method] = None - return latest From 91f63635e8a6e20ab07f26dac5c3c62eeaf07e2b Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 31 Jul 2020 14:46:21 -0500 Subject: [PATCH 07/15] feat: Add singlefile in a couple more places --- archivebox/index/schema.py | 1 + 1 file changed, 1 insertion(+) diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 0824dbde..09476034 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -426,6 +426,7 @@ class Link: 'pdf_path': static_path, 'screenshot_path': static_path, 'dom_path': static_path, + 'singlefile_path': static_path, }) return canonical From 37df00a08b486b247de0bd43af45125fcedcf2c8 Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 31 Jul 2020 14:49:54 -0500 Subject: [PATCH 08/15] tests: Add basic singlefile test --- tests/test_extractors.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/test_extractors.py b/tests/test_extractors.py index c7aaaeaf..9b354f08 100644 --- a/tests/test_extractors.py +++ b/tests/test_extractors.py @@ -10,4 +10,12 @@ def test_ignore_methods(): Takes the passed method out of the default methods list and returns that value """ ignored = ignore_methods(['title']) - assert should_save_title not in ignored \ No newline at end of file + assert should_save_title not in ignored + + + +def test_singlefile_works(tmp_path, process): + add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True) + archived_item_path = list(tmp_path.glob('archive/**/*'))[0] + output_file = archived_item_path / "single-file.html" + assert output_file.exists() From 5b6eb5e4ad8944ac4b0d936ffa18e04123c6b61c Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 1 Aug 2020 11:59:07 -0400 Subject: [PATCH 09/15] make filenames consistent with program name --- archivebox/extractors/singlefile.py | 4 ++-- archivebox/index/schema.py | 5 ++--- archivebox/themes/legacy/link_details.html | 2 +- tests/test_extractors.py | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index 196765d8..60ebdab6 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -23,7 +23,7 @@ from ..logging_util import TimedProgress def should_save_singlefile(link: Link, out_dir: Optional[str]=None) -> bool: out_dir = out_dir or link.link_dir - output = Path(out_dir or link.link_dir) / 'single-file.html' + output = Path(out_dir or link.link_dir) / 'singlefile.html' return SAVE_SINGLEFILE and (not output.exists()) @@ -32,7 +32,7 @@ def save_singlefile(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOU """download full site using single-file""" out_dir = out_dir or link.link_dir - output = str(Path(out_dir).absolute() / "single-file.html") + output = str(Path(out_dir).absolute() / "singlefile.html") # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli cmd = [ diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 09476034..2129f5d3 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -365,7 +365,7 @@ class Link: 'screenshot.png', 'output.html', 'media', - 'single-file.html' + 'singlefile.html' ) return any( @@ -406,7 +406,7 @@ class Link: 'google_favicon_path': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain), 'wget_path': wget_output_path(self), 'warc_path': 'warc', - 'singlefile_path': 'single-file.html', + 'singlefile_path': 'singlefile.html', 'pdf_path': 'output.pdf', 'screenshot_path': 'screenshot.png', 'dom_path': 'output.html', @@ -430,4 +430,3 @@ class Link: }) return canonical - diff --git a/archivebox/themes/legacy/link_details.html b/archivebox/themes/legacy/link_details.html index 405d933c..447552ad 100644 --- a/archivebox/themes/legacy/link_details.html +++ b/archivebox/themes/legacy/link_details.html @@ -344,7 +344,7 @@

SingleFile

-

archive/single-file.html

+

archive/singlefile.html

diff --git a/tests/test_extractors.py b/tests/test_extractors.py index 9b354f08..fb02044b 100644 --- a/tests/test_extractors.py +++ b/tests/test_extractors.py @@ -17,5 +17,5 @@ def test_ignore_methods(): def test_singlefile_works(tmp_path, process): add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True) archived_item_path = list(tmp_path.glob('archive/**/*'))[0] - output_file = archived_item_path / "single-file.html" + output_file = archived_item_path / "singlefile.html" assert output_file.exists() From 06d0e9de6cdf1e64a16cc679a153e4a367bd6445 Mon Sep 17 00:00:00 2001 From: Cristian Date: Mon, 3 Aug 2020 13:19:47 -0500 Subject: [PATCH 10/15] feat: Add support for singlefile in docker --- Dockerfile | 45 +++++++++++++++++++---------- archivebox/extractors/singlefile.py | 5 ++++ archivebox/logging_util.py | 1 + 3 files changed, 35 insertions(+), 16 deletions(-) diff --git a/Dockerfile b/Dockerfile index 97bd1bd1..7d76ea1d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,8 +10,8 @@ FROM python:3.8-slim-buster LABEL name="archivebox" \ - maintainer="Nick Sweeting " \ - description="All-in-one personal internet archiving container" + maintainer="Nick Sweeting " \ + description="All-in-one personal internet archiving container" ENV TZ=UTC \ LANGUAGE=en_US:en \ @@ -22,28 +22,41 @@ ENV TZ=UTC \ APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \ CODE_PATH=/app \ VENV_PATH=/venv \ - DATA_PATH=/data + DATA_PATH=/data \ + EXTRA_PATH=/extra -# First install CLI utils and base deps, then Chrome + Fons +# First install CLI utils and base deps, then Chrome + Fons + nodejs RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections \ && apt-get update -qq \ && apt-get install -qq -y --no-install-recommends \ - apt-transport-https ca-certificates apt-utils gnupg gosu gnupg2 libgconf-2-4 zlib1g-dev \ - dumb-init jq git wget curl youtube-dl ffmpeg \ + apt-transport-https ca-certificates apt-utils gnupg gosu gnupg2 libgconf-2-4 zlib1g-dev \ + dumb-init jq git wget curl youtube-dl ffmpeg \ && curl -sSL "https://dl.google.com/linux/linux_signing_key.pub" | apt-key add - \ && echo "deb https://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \ + && curl -sL https://deb.nodesource.com/setup_14.x | bash - \ && apt-get update -qq \ && apt-get install -qq -y --no-install-recommends \ - google-chrome-stable \ - fontconfig \ - fonts-ipafont-gothic \ - fonts-wqy-zenhei \ - fonts-thai-tlwg \ - fonts-kacst \ - fonts-symbola \ - fonts-noto \ - fonts-freefont-ttf \ - && rm -rf /var/lib/apt/lists/* + google-chrome-stable \ + fontconfig \ + fonts-ipafont-gothic \ + fonts-wqy-zenhei \ + fonts-thai-tlwg \ + fonts-kacst \ + fonts-symbola \ + fonts-noto \ + fonts-freefont-ttf \ + nodejs \ + unzip \ + && rm -rf /var/lib/apt/lists/* + +# Clone singlefile and move it to the /bin folder so archivebox can find it + +WORKDIR "$EXTRA_PATH" +RUN wget -qO - https://github.com/gildas-lormeau/SingleFile/archive/master.zip > SingleFile.zip \ + && unzip -q SingleFile.zip \ + && npm install --prefix SingleFile-master/cli --production > /dev/null 2>&1 \ + && chmod +x SingleFile-master/cli/single-file \ + && ln -s "$EXTRA_PATH/SingleFile-master/cli/single-file" "/bin/single-file" # Run everything from here on out as non-privileged user RUN groupadd --system archivebox \ diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index 60ebdab6..3a1a3759 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -3,11 +3,13 @@ __package__ = 'archivebox.extractors' from pathlib import Path from typing import Optional +import json from ..index.schema import Link, ArchiveResult, ArchiveError from ..system import run, chmod_file from ..util import ( enforce_types, + chrome_args ) from ..config import ( TIMEOUT, @@ -34,10 +36,13 @@ def save_singlefile(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOU out_dir = out_dir or link.link_dir output = str(Path(out_dir).absolute() / "singlefile.html") + browser_args = chrome_args(TIMEOUT=0) + # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli cmd = [ SINGLEFILE_BINARY, '--browser-executable-path={}'.format(CHROME_BINARY), + '--browser-args="{}"'.format(json.dumps(browser_args[1:])), link.url, output ] diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index c44f87f1..684f3d80 100644 --- a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -518,6 +518,7 @@ def printable_folder_status(name: str, folder: Dict) -> str: @enforce_types def printable_dependency_version(name: str, dependency: Dict) -> str: + version = None if dependency['enabled']: if dependency['is_valid']: color, symbol, note, version = 'green', '√', 'valid', '' From 3c5c6a689e8b5b4d75ca4791c3b9266f29097254 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 4 Aug 2020 07:35:58 -0500 Subject: [PATCH 11/15] fix: Add missing configuration variable to be able to disable singlefile --- archivebox/config/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index e1e3117f..826f9824 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -254,6 +254,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and (c['SAVE_SINGLEFILE'])}, 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None}, + 'SAVE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']}, 'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None}, From 5429096c305500f586abfd44b4450e9917785c3b Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 4 Aug 2020 08:42:30 -0500 Subject: [PATCH 12/15] tests: Add mechanism to avoid using extractors that we are not testing --- tests/fixtures.py | 17 ++++++++++++++++- tests/test_args.py | 23 ++++++++++++++--------- tests/test_extractors.py | 14 ++++++++------ tests/test_init.py | 15 +++++++++------ tests/test_oneshot.py | 9 +++++---- tests/test_remove.py | 4 ++-- tests/test_title.py | 5 +++-- 7 files changed, 57 insertions(+), 30 deletions(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index 9bf2640a..3d8dabfe 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -7,4 +7,19 @@ import pytest def process(tmp_path): os.chdir(tmp_path) process = subprocess.run(['archivebox', 'init'], capture_output=True) - return process \ No newline at end of file + return process + +@pytest.fixture +def disable_extractors_dict(): + env = os.environ.copy() + env.update({ + "USE_WGET": "false", + "USE_SINGLEFILE": "false", + "SAVE_PDF": "false", + "SAVE_SCREENSHOT": "false", + "SAVE_DOM": "false", + "USE_GIT": "false", + "SAVE_MEDIA": "false", + "SAVE_ARCHIVE_DOT_ORG": "false" + }) + return env \ No newline at end of file diff --git a/tests/test_args.py b/tests/test_args.py index ed132524..c616cb80 100644 --- a/tests/test_args.py +++ b/tests/test_args.py @@ -3,25 +3,30 @@ import json from .fixtures import * -def test_depth_flag_is_accepted(process): - arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], capture_output=True) +def test_depth_flag_is_accepted(process, disable_extractors_dict): + arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], + capture_output=True, env=disable_extractors_dict) assert 'unrecognized arguments: --depth' not in arg_process.stderr.decode("utf-8") -def test_depth_flag_fails_if_it_is_not_0_or_1(process): - arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=5"], capture_output=True) +def test_depth_flag_fails_if_it_is_not_0_or_1(process, disable_extractors_dict): + arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=5"], + capture_output=True, env=disable_extractors_dict) assert 'invalid choice' in arg_process.stderr.decode("utf-8") - arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=-1"], capture_output=True) + arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=-1"], + capture_output=True, env=disable_extractors_dict) assert 'invalid choice' in arg_process.stderr.decode("utf-8") -def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process): - arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], capture_output=True) +def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process, disable_extractors_dict): + arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], + capture_output=True, env=disable_extractors_dict) archived_item_path = list(tmp_path.glob('archive/**/*'))[0] with open(archived_item_path / "index.json", "r") as f: output_json = json.load(f) assert output_json["base_url"] == "127.0.0.1:8080/static/example.com.html" -def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process): - arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=1"], capture_output=True) +def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process, disable_extractors_dict): + arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=1"], + capture_output=True, env=disable_extractors_dict) with open(tmp_path / "index.json", "r") as f: archive_file = f.read() assert "http://127.0.0.1:8080/static/example.com.html" in archive_file diff --git a/tests/test_extractors.py b/tests/test_extractors.py index fb02044b..ffb933c1 100644 --- a/tests/test_extractors.py +++ b/tests/test_extractors.py @@ -1,8 +1,10 @@ from .fixtures import * from archivebox.extractors import ignore_methods, get_default_archive_methods, should_save_title -def test_wget_broken_pipe(tmp_path, process): - add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True) +def test_wget_broken_pipe(tmp_path, process, disable_extractors_dict): + disable_extractors_dict.update({"USE_WGET": "true"}) + add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], + capture_output=True, env=disable_extractors_dict) assert "TypeError chmod_file(..., path: str) got unexpected NoneType argument path=None" not in add_process.stdout.decode("utf-8") def test_ignore_methods(): @@ -12,10 +14,10 @@ def test_ignore_methods(): ignored = ignore_methods(['title']) assert should_save_title not in ignored - - -def test_singlefile_works(tmp_path, process): - add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True) +def test_singlefile_works(tmp_path, process, disable_extractors_dict): + disable_extractors_dict.update({"USE_SINGLEFILE": "true"}) + add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], + capture_output=True, env=disable_extractors_dict) archived_item_path = list(tmp_path.glob('archive/**/*'))[0] output_file = archived_item_path / "singlefile.html" assert output_file.exists() diff --git a/tests/test_init.py b/tests/test_init.py index 133aaaa9..bd1ad516 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -18,9 +18,10 @@ def test_update(tmp_path, process): update_process = subprocess.run(['archivebox', 'init'], capture_output=True) assert "Updating existing ArchiveBox collection in this folder" in update_process.stdout.decode("utf-8") -def test_add_link(tmp_path, process): +def test_add_link(tmp_path, process, disable_extractors_dict): os.chdir(tmp_path) - add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True) + add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], + capture_output=True, env=disable_extractors_dict) archived_item_path = list(tmp_path.glob('archive/**/*'))[0] assert "index.json" in [x.name for x in archived_item_path.iterdir()] @@ -33,9 +34,10 @@ def test_add_link(tmp_path, process): output_html = f.read() assert "Example Domain" in output_html -def test_add_link_support_stdin(tmp_path, process): +def test_add_link_support_stdin(tmp_path, process, disable_extractors_dict): os.chdir(tmp_path) - stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + env=disable_extractors_dict) stdin_process.communicate(input="http://127.0.0.1:8080/static/example.com.html".encode()) archived_item_path = list(tmp_path.glob('archive/**/*'))[0] @@ -51,9 +53,10 @@ def test_correct_permissions_output_folder(tmp_path, process): file_path = tmp_path / file assert oct(file_path.stat().st_mode)[-3:] == OUTPUT_PERMISSIONS -def test_correct_permissions_add_command_results(tmp_path, process): +def test_correct_permissions_add_command_results(tmp_path, process, disable_extractors_dict): os.chdir(tmp_path) - add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True) + add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, + env=disable_extractors_dict) archived_item_path = list(tmp_path.glob('archive/**/*'))[0] for path in archived_item_path.iterdir(): assert oct(path.stat().st_mode)[-3:] == OUTPUT_PERMISSIONS diff --git a/tests/test_oneshot.py b/tests/test_oneshot.py index 7ff9867f..66a567e1 100644 --- a/tests/test_oneshot.py +++ b/tests/test_oneshot.py @@ -2,13 +2,14 @@ from pathlib import Path from .fixtures import * -def test_oneshot_command_exists(tmp_path): +def test_oneshot_command_exists(tmp_path, disable_extractors_dict): os.chdir(tmp_path) - process = subprocess.run(['archivebox', 'oneshot'], capture_output=True) + process = subprocess.run(['archivebox', 'oneshot'], capture_output=True, env=disable_extractors_dict) assert not "invalid choice: 'oneshot'" in process.stderr.decode("utf-8") -def test_oneshot_commad_saves_page_in_right_folder(tmp_path): - process = subprocess.run(["archivebox", "oneshot", f"--out-dir={tmp_path}", "http://127.0.0.1:8080/static/example.com.html"], capture_output=True) +def test_oneshot_commad_saves_page_in_right_folder(tmp_path, disable_extractors_dict): + process = subprocess.run(["archivebox", "oneshot", f"--out-dir={tmp_path}", "http://127.0.0.1:8080/static/example.com.html"], + capture_output=True, env=disable_extractors_dict) items = ' '.join([str(x) for x in tmp_path.iterdir()]) current_path = ' '.join([str(x) for x in Path.cwd().iterdir()]) assert "index.json" in items diff --git a/tests/test_remove.py b/tests/test_remove.py index 040dafdc..d26c96bb 100644 --- a/tests/test_remove.py +++ b/tests/test_remove.py @@ -1,8 +1,8 @@ from .fixtures import * -def test_remove_leaves_index_in_consistent_state(tmp_path, process): +def test_remove_leaves_index_in_consistent_state(tmp_path, process, disable_extractors_dict): os.chdir(tmp_path) - subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True) + subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict) remove_process = subprocess.run(['archivebox', 'remove', '127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True) list_process = subprocess.run(['archivebox', 'list'], capture_output=True) assert "Warning: SQL index does not match JSON index!" not in list_process.stderr.decode("utf-8") \ No newline at end of file diff --git a/tests/test_title.py b/tests/test_title.py index b5090844..24b2cc28 100644 --- a/tests/test_title.py +++ b/tests/test_title.py @@ -1,12 +1,13 @@ from .fixtures import * -def test_title_is_htmlencoded_in_index_html(tmp_path, process): +def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractors_dict): """ https://github.com/pirate/ArchiveBox/issues/330 Unencoded content should not be rendered as it facilitates xss injections and breaks the layout. """ - add_process = subprocess.run(['archivebox', 'add', 'http://localhost:8080/static/title_with_html.com.html'], capture_output=True) + add_process = subprocess.run(['archivebox', 'add', 'http://localhost:8080/static/title_with_html.com.html'], + capture_output=True, env=disable_extractors_dict) with open(tmp_path / "index.html", "r") as f: output_html = f.read() From 02afd948dc35f079a63807f1145f2ff3efc697f2 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 4 Aug 2020 09:01:17 -0500 Subject: [PATCH 13/15] tests: Add singlefile steps --- .github/workflows/test.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9fccf5da..117a760a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -40,6 +40,22 @@ jobs: with: fetch-depth: 1 + - uses: actions/checkout@v2 + with: + fetch-depth: 1 + repository: "gildas-lormeau/SingleFile" + ref: "master" + path: "singlefile" + + - name: Install npm requirements for singlefile + run: npm install --prefix singlefile/cli + + - name: Give singlefile execution permissions + run: chmod +x singlefile/cli/single-file + + - name: Add singlefile cli folder to path + run: echo "::add-path::$GITHUB_WORKSPACE/singlefile/cli" + - name: Set up Python ${{ matrix.python }} uses: actions/setup-python@v1 with: @@ -55,6 +71,14 @@ jobs: restore-keys: | ${{ runner.os }}-${{ matrix.python }}-venv- + - name: Use nodejs 14.7.0 + uses: actions/setup-node@v1 + with: + node-version: 14.7.0 + + - name: Debug + run: ls ./ + - name: Install dependencies run: | python -m pip install . From e2c4e6fff98b3f94ffc9403da37289afeeca1ec6 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 4 Aug 2020 11:50:01 -0500 Subject: [PATCH 14/15] refactor: Dockerfile uses env to point to the singlefile binary instead of adding it to a path folder --- Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7d76ea1d..460175d9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -55,8 +55,7 @@ WORKDIR "$EXTRA_PATH" RUN wget -qO - https://github.com/gildas-lormeau/SingleFile/archive/master.zip > SingleFile.zip \ && unzip -q SingleFile.zip \ && npm install --prefix SingleFile-master/cli --production > /dev/null 2>&1 \ - && chmod +x SingleFile-master/cli/single-file \ - && ln -s "$EXTRA_PATH/SingleFile-master/cli/single-file" "/bin/single-file" + && chmod +x SingleFile-master/cli/single-file # Run everything from here on out as non-privileged user RUN groupadd --system archivebox \ @@ -73,7 +72,8 @@ VOLUME "$DATA_PATH" WORKDIR "$DATA_PATH" EXPOSE 8000 ENV CHROME_BINARY=google-chrome \ - CHROME_SANDBOX=False + CHROME_SANDBOX=False \ + SINGLEFILE_BINARY="$EXTRA_PATH/SingleFile-master/cli/single-file" RUN env ALLOW_ROOT=True archivebox version From 733075cbd0b91637559436117ccfb19b2467ebca Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 4 Aug 2020 11:53:31 -0500 Subject: [PATCH 15/15] tests: Set SINGLEFILE_BINARY instead of putting the binary in the path --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 117a760a..e689b7c5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -53,8 +53,8 @@ jobs: - name: Give singlefile execution permissions run: chmod +x singlefile/cli/single-file - - name: Add singlefile cli folder to path - run: echo "::add-path::$GITHUB_WORKSPACE/singlefile/cli" + - name: Set SINGLEFILE_BINARY + run: echo "::set-env name=SINGLEFILE_BINARY::$GITHUB_WORKSPACE/singlefile/cli/single-file" - name: Set up Python ${{ matrix.python }} uses: actions/setup-python@v1