Merge pull request #403 from cdvv7788/single-file

commit c8e3aed647 by Nick Sweeting, 2020-08-06 22:43:47 -04:00, committed by GitHub
GPG key ID: 4AEE18F83AFDEB23 (no known key found for this signature in database)
16 changed files with 236 additions and 47 deletions

View file

@@ -40,6 +40,22 @@ jobs:
       with:
         fetch-depth: 1
+    - uses: actions/checkout@v2
+      with:
+        fetch-depth: 1
+        repository: "gildas-lormeau/SingleFile"
+        ref: "master"
+        path: "singlefile"
+    - name: Install npm requirements for singlefile
+      run: npm install --prefix singlefile/cli
+    - name: Give singlefile execution permissions
+      run: chmod +x singlefile/cli/single-file
+    - name: Set SINGLEFILE_BINARY
+      run: echo "::set-env name=SINGLEFILE_BINARY::$GITHUB_WORKSPACE/singlefile/cli/single-file"
     - name: Set up Python ${{ matrix.python }}
       uses: actions/setup-python@v1
       with:
@@ -60,6 +76,14 @@ jobs:
         restore-keys: |
           ${{ runner.os }}-${{ matrix.python }}-venv-
+    - name: Use nodejs 14.7.0
+      uses: actions/setup-node@v1
+      with:
+        node-version: 14.7.0
+    - name: Debug
+      run: ls ./
     - name: Install dependencies
       run: |
         python -m pip install .
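
Note: GitHub deprecated the ::set-env workflow command later in 2020; on current runners the equivalent step writes to the environment file instead, e.g. run: echo "SINGLEFILE_BINARY=$GITHUB_WORKSPACE/singlefile/cli/single-file" >> "$GITHUB_ENV".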

View file

@@ -10,8 +10,8 @@
 FROM python:3.8-slim-buster

 LABEL name="archivebox" \
     maintainer="Nick Sweeting <archivebox-git@sweeting.me>" \
     description="All-in-one personal internet archiving container"

 ENV TZ=UTC \
     LANGUAGE=en_US:en \
@@ -22,28 +22,40 @@ ENV TZ=UTC \
     APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \
     CODE_PATH=/app \
     VENV_PATH=/venv \
-    DATA_PATH=/data
+    DATA_PATH=/data \
+    EXTRA_PATH=/extra

-# First install CLI utils and base deps, then Chrome + Fonts
+# First install CLI utils and base deps, then Chrome + Fonts + nodejs
 RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections \
     && apt-get update -qq \
     && apt-get install -qq -y --no-install-recommends \
         apt-transport-https ca-certificates apt-utils gnupg gosu gnupg2 libgconf-2-4 zlib1g-dev \
         dumb-init jq git wget curl youtube-dl ffmpeg \
     && curl -sSL "https://dl.google.com/linux/linux_signing_key.pub" | apt-key add - \
     && echo "deb https://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \
+    && curl -sL https://deb.nodesource.com/setup_14.x | bash - \
     && apt-get update -qq \
     && apt-get install -qq -y --no-install-recommends \
         google-chrome-stable \
         fontconfig \
         fonts-ipafont-gothic \
         fonts-wqy-zenhei \
         fonts-thai-tlwg \
         fonts-kacst \
         fonts-symbola \
         fonts-noto \
         fonts-freefont-ttf \
-    && rm -rf /var/lib/apt/lists/*
+        nodejs \
+        unzip \
+    && rm -rf /var/lib/apt/lists/*
+
+# Download SingleFile into $EXTRA_PATH so archivebox can find it
+WORKDIR "$EXTRA_PATH"
+RUN wget -qO - https://github.com/gildas-lormeau/SingleFile/archive/master.zip > SingleFile.zip \
+    && unzip -q SingleFile.zip \
+    && npm install --prefix SingleFile-master/cli --production > /dev/null 2>&1 \
+    && chmod +x SingleFile-master/cli/single-file

 # Run everything from here on out as non-privileged user
 RUN groupadd --system archivebox \
@@ -60,7 +72,8 @@ VOLUME "$DATA_PATH"
 WORKDIR "$DATA_PATH"
 EXPOSE 8000

 ENV CHROME_BINARY=google-chrome \
-    CHROME_SANDBOX=False
+    CHROME_SANDBOX=False \
+    SINGLEFILE_BINARY="$EXTRA_PATH/SingleFile-master/cli/single-file"

 RUN env ALLOW_ROOT=True archivebox version
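
The final RUN step doubles as a smoke test: archivebox version resolves SINGLEFILE_BINARY via the ENV line above. A minimal standalone check along the same lines, assuming the container paths from this Dockerfile (ArchiveBox's bin_version() performs essentially this --version probe):

    import subprocess

    # Paths assumed from the Dockerfile above (EXTRA_PATH=/extra); the CLI must
    # be executable and answer --version for SINGLEFILE_VERSION to resolve.
    result = subprocess.run(
        ["/extra/SingleFile-master/cli/single-file", "--version"],
        capture_output=True, text=True,
    )
    assert result.returncode == 0, result.stderr
    print(result.stdout.strip())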

View file

@@ -74,6 +74,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
     'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)},
     'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)},
     'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
+    'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)},
     'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
     'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
     'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
@@ -104,6 +105,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
 'DEPENDENCY_CONFIG': {
     'USE_CURL': {'type': bool, 'default': True},
     'USE_WGET': {'type': bool, 'default': True},
+    'USE_SINGLEFILE': {'type': bool, 'default': True},
     'USE_GIT': {'type': bool, 'default': True},
     'USE_CHROME': {'type': bool, 'default': True},
     'USE_YOUTUBEDL': {'type': bool, 'default': True},
@@ -111,6 +113,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
     'CURL_BINARY': {'type': str, 'default': 'curl'},
     'GIT_BINARY': {'type': str, 'default': 'git'},
     'WGET_BINARY': {'type': str, 'default': 'wget'},
+    'SINGLEFILE_BINARY': {'type': str, 'default': 'single-file'},
     'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
     'CHROME_BINARY': {'type': str, 'default': None},
 },
@@ -249,6 +252,10 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
     'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
     'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
+    'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and (c['SAVE_SINGLEFILE'])},
+    'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
+    'SAVE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
     'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
     'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
     'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
@@ -674,6 +681,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
         'enabled': config['USE_WGET'],
         'is_valid': bool(config['WGET_VERSION']),
     },
+    'SINGLEFILE_BINARY': {
+        'path': bin_path(config['SINGLEFILE_BINARY']),
+        'version': config['SINGLEFILE_VERSION'],
+        'hash': bin_hash(config['SINGLEFILE_BINARY']),
+        'enabled': config['USE_SINGLEFILE'],
+        'is_valid': bool(config['SINGLEFILE_VERSION']),
+    },
     'GIT_BINARY': {
         'path': bin_path(config['GIT_BINARY']),
         'version': config['GIT_VERSION'],
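
With these defaults registered, the new extractor is toggled like any other: USE_SINGLEFILE/SAVE_SINGLEFILE and the SINGLEFILE_BINARY path can all come from the environment, which is exactly the pattern the tests below rely on. A minimal sketch (values illustrative):

    import os
    import subprocess

    # Variable names come from CONFIG_DEFAULTS above; the binary path is an example.
    env = os.environ.copy()
    env.update({
        "SAVE_SINGLEFILE": "true",
        "SINGLEFILE_BINARY": "/extra/SingleFile-master/cli/single-file",
    })
    subprocess.run(["archivebox", "add", "https://example.com"], env=env)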

View file

@@ -106,6 +106,7 @@ class SnapshotAdmin(admin.ModelAdmin):
             '<a href="/{}/{}" class="exists-{}" title="Screenshot">🖥 </a> '
             '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
             '<a href="/{}/{}/" class="exists-{}" title="WARC">🆆 </a> '
+            '<a href="/{}/{}" class="exists-{}" title="SingleFile">&#128476; </a>'
             '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
             '<a href="/{}/{}/" class="exists-{}" title="Git repos">📦 </a> '
             '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
@@ -115,6 +116,7 @@ class SnapshotAdmin(admin.ModelAdmin):
             *link_tuple(link, 'screenshot_path'),
             *link_tuple(link, 'dom_path'),
             *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
+            *link_tuple(link, 'singlefile_path'),
             *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
             *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
             canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),

View file

@@ -25,6 +25,7 @@ from ..logging_util import (
 from .title import should_save_title, save_title
 from .favicon import should_save_favicon, save_favicon
 from .wget import should_save_wget, save_wget
+from .singlefile import should_save_singlefile, save_singlefile
 from .pdf import should_save_pdf, save_pdf
 from .screenshot import should_save_screenshot, save_screenshot
 from .dom import should_save_dom, save_dom
@@ -37,6 +38,7 @@ def get_default_archive_methods():
         ('title', should_save_title, save_title),
         ('favicon', should_save_favicon, save_favicon),
         ('wget', should_save_wget, save_wget),
+        ('singlefile', should_save_singlefile, save_singlefile),
         ('pdf', should_save_pdf, save_pdf),
         ('screenshot', should_save_screenshot, save_screenshot),
         ('dom', should_save_dom, save_dom),
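
Since get_default_archive_methods() returns an ordered list of (name, should_save, save) tuples, the new method runs right after wget and before the Chrome-based pdf/screenshot/dom extractors. A quick way to inspect the order:

    from archivebox.extractors import get_default_archive_methods

    # Each entry is (name, should_save_fn, save_fn); list order is execution order.
    for name, _should_save, _save in get_default_archive_methods():
        print(name)
    # title, favicon, wget, singlefile, pdf, screenshot, dom, ...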

View file

@@ -0,0 +1,84 @@
+__package__ = 'archivebox.extractors'
+
+from pathlib import Path
+from typing import Optional
+import json
+
+from ..index.schema import Link, ArchiveResult, ArchiveError
+from ..system import run, chmod_file
+from ..util import (
+    enforce_types,
+    chrome_args
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_SINGLEFILE,
+    SINGLEFILE_BINARY,
+    SINGLEFILE_VERSION,
+    CHROME_BINARY,
+)
+from ..logging_util import TimedProgress
+
+
+@enforce_types
+def should_save_singlefile(link: Link, out_dir: Optional[str]=None) -> bool:
+    out_dir = out_dir or link.link_dir
+    output = Path(out_dir or link.link_dir) / 'singlefile.html'
+    return SAVE_SINGLEFILE and (not output.exists())
+
+
+@enforce_types
+def save_singlefile(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """download full site using single-file"""
+
+    out_dir = out_dir or link.link_dir
+    output = str(Path(out_dir).absolute() / "singlefile.html")
+
+    browser_args = chrome_args(TIMEOUT=0)
+
+    # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
+    cmd = [
+        SINGLEFILE_BINARY,
+        '--browser-executable-path={}'.format(CHROME_BINARY),
+        '--browser-args="{}"'.format(json.dumps(browser_args[1:])),
+        link.url,
+        output
+    ]
+
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix=' ')
+    try:
+        result = run(cmd, cwd=out_dir, timeout=timeout)
+
+        # grab the last few non-empty lines of output to use as error hints
+        output_tail = [
+            line.strip()
+            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
+            if line.strip()
+        ]
+        hints = (
+            'Got single-file response code: {}.'.format(result.returncode),
+            *output_tail,
+        )
+
+        # Check for common failure cases
+        if (result.returncode > 0):
+            raise ArchiveError('SingleFile was not able to archive the page', hints)
+        chmod_file(output)
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=SINGLEFILE_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )
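
One detail worth noting: chrome_args() returns the Chrome binary followed by its flags, so the browser_args[1:] slice forwards only the flags; the binary itself is passed separately via --browser-executable-path. A sketch of the argv this builds, with illustrative flag values:

    import json

    # Hypothetical flags standing in for chrome_args(TIMEOUT=0)[1:]:
    flags = ["--headless", "--window-size=1440,2000"]
    cmd = [
        "single-file",
        "--browser-executable-path=google-chrome",
        '--browser-args="{}"'.format(json.dumps(flags)),
        "https://example.com",
        "/data/archive/1596768000/singlefile.html",
    ]
    print(" ".join(cmd))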

View file

@@ -365,6 +365,7 @@ class Link:
             'screenshot.png',
             'output.html',
             'media',
+            'singlefile.html'
         )
         return any(
@@ -376,7 +377,7 @@ class Link:
         """get the latest output that each archive method produced for link"""
         ARCHIVE_METHODS = (
-            'title', 'favicon', 'wget', 'warc', 'pdf',
+            'title', 'favicon', 'wget', 'warc', 'singlefile', 'pdf',
             'screenshot', 'dom', 'git', 'media', 'archive_org',
         )
         latest: Dict[str, ArchiveOutput] = {}
@@ -392,7 +393,6 @@ class Link:
                 latest[archive_method] = history[0].output
             else:
                 latest[archive_method] = None
-
         return latest
@@ -406,6 +406,7 @@ class Link:
             'google_favicon_path': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
             'wget_path': wget_output_path(self),
             'warc_path': 'warc',
+            'singlefile_path': 'singlefile.html',
             'pdf_path': 'output.pdf',
             'screenshot_path': 'screenshot.png',
             'dom_path': 'output.html',
@@ -425,7 +426,7 @@ class Link:
                 'pdf_path': static_path,
                 'screenshot_path': static_path,
                 'dom_path': static_path,
+                'singlefile_path': static_path,
             })
         return canonical

View file

@@ -518,6 +518,7 @@ def printable_folder_status(name: str, folder: Dict) -> str:

 @enforce_types
 def printable_dependency_version(name: str, dependency: Dict) -> str:
+    version = None
     if dependency['enabled']:
         if dependency['is_valid']:
             color, symbol, note, version = 'green', '√', 'valid', ''
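
The version = None initialization presumably guards the fallback paths: when a dependency is disabled or its version probe fails, the function would otherwise reach its closing format string with the local version never assigned, raising UnboundLocalError.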

View file

@@ -79,6 +79,7 @@
         .card {
             overflow: hidden;
             box-shadow: 2px 3px 14px 0px rgba(0,0,0,0.02);
+            margin-top: 10px;
         }
         .card h4 {
             font-size: 1.4vw;
@@ -335,6 +336,18 @@
                     </div>
                 </div>
             </div>
+            <div class="col-lg-2">
+                <div class="card">
+                    <iframe class="card-img-top" src="$singlefile_path" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                    <div class="card-body">
+                        <a href="$singlefile_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
+                            <img src="../../static/external.png" class="external"/>
+                        </a>
+                        <a href="$singlefile_path" target="preview"><h4 class="card-title">SingleFile</h4></a>
+                        <p class="card-text">archive/singlefile.html</p>
+                    </div>
+                </div>
+            </div>
             <div class="col-lg-2">
                 <div class="card">
                     <iframe class="card-img-top pdf-frame" src="$pdf_path" scrolling="no"></iframe>

View file

@@ -7,4 +7,19 @@ import pytest
 def process(tmp_path):
     os.chdir(tmp_path)
     process = subprocess.run(['archivebox', 'init'], capture_output=True)
     return process
+
+@pytest.fixture
+def disable_extractors_dict():
+    env = os.environ.copy()
+    env.update({
+        "USE_WGET": "false",
+        "USE_SINGLEFILE": "false",
+        "SAVE_PDF": "false",
+        "SAVE_SCREENSHOT": "false",
+        "SAVE_DOM": "false",
+        "USE_GIT": "false",
+        "SAVE_MEDIA": "false",
+        "SAVE_ARCHIVE_DOT_ORG": "false"
+    })
+    return env
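
The values are the strings "false"/"true" rather than Python booleans because the fixture feeds subprocess environments, where everything is a string; ArchiveBox's bool config parsing interprets them on the other side. Tests that need one extractor back on simply flip its flag, as the wget and singlefile tests below do with disable_extractors_dict.update({"USE_WGET": "true"}).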

View file

@@ -3,25 +3,30 @@ import json
 from .fixtures import *

-def test_depth_flag_is_accepted(process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], capture_output=True)
+def test_depth_flag_is_accepted(process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"],
+                                 capture_output=True, env=disable_extractors_dict)
     assert 'unrecognized arguments: --depth' not in arg_process.stderr.decode("utf-8")

-def test_depth_flag_fails_if_it_is_not_0_or_1(process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=5"], capture_output=True)
+def test_depth_flag_fails_if_it_is_not_0_or_1(process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=5"],
+                                 capture_output=True, env=disable_extractors_dict)
     assert 'invalid choice' in arg_process.stderr.decode("utf-8")
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=-1"], capture_output=True)
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=-1"],
+                                 capture_output=True, env=disable_extractors_dict)
     assert 'invalid choice' in arg_process.stderr.decode("utf-8")

-def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], capture_output=True)
+def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"],
+                                 capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
     with open(archived_item_path / "index.json", "r") as f:
         output_json = json.load(f)
     assert output_json["base_url"] == "127.0.0.1:8080/static/example.com.html"

-def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=1"], capture_output=True)
+def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=1"],
+                                 capture_output=True, env=disable_extractors_dict)
     with open(tmp_path / "index.json", "r") as f:
         archive_file = f.read()
     assert "http://127.0.0.1:8080/static/example.com.html" in archive_file

View file

@@ -1,8 +1,10 @@
 from .fixtures import *
 from archivebox.extractors import ignore_methods, get_default_archive_methods, should_save_title

-def test_wget_broken_pipe(tmp_path, process):
-    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+def test_wget_broken_pipe(tmp_path, process, disable_extractors_dict):
+    disable_extractors_dict.update({"USE_WGET": "true"})
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
     assert "TypeError chmod_file(..., path: str) got unexpected NoneType argument path=None" not in add_process.stdout.decode("utf-8")

 def test_ignore_methods():
@@ -10,4 +12,12 @@ def test_ignore_methods():
     Takes the passed method out of the default methods list and returns that value
     """
     ignored = ignore_methods(['title'])
     assert should_save_title not in ignored
+
+def test_singlefile_works(tmp_path, process, disable_extractors_dict):
+    disable_extractors_dict.update({"USE_SINGLEFILE": "true"})
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+    output_file = archived_item_path / "singlefile.html"
+    assert output_file.exists()

View file

@@ -18,9 +18,10 @@ def test_update(tmp_path, process):
     update_process = subprocess.run(['archivebox', 'init'], capture_output=True)
     assert "Updating existing ArchiveBox collection in this folder" in update_process.stdout.decode("utf-8")

-def test_add_link(tmp_path, process):
+def test_add_link(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
-    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]

     assert "index.json" in [x.name for x in archived_item_path.iterdir()]
@@ -33,9 +34,10 @@ def test_add_link(tmp_path, process):
         output_html = f.read()
     assert "Example Domain" in output_html

-def test_add_link_support_stdin(tmp_path, process):
+def test_add_link_support_stdin(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
-    stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+                                     env=disable_extractors_dict)
     stdin_process.communicate(input="http://127.0.0.1:8080/static/example.com.html".encode())
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
@@ -51,9 +53,10 @@ def test_correct_permissions_output_folder(tmp_path, process):
         file_path = tmp_path / file
         assert oct(file_path.stat().st_mode)[-3:] == OUTPUT_PERMISSIONS

-def test_correct_permissions_add_command_results(tmp_path, process):
+def test_correct_permissions_add_command_results(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
-    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True,
+                                 env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
     for path in archived_item_path.iterdir():
         assert oct(path.stat().st_mode)[-3:] == OUTPUT_PERMISSIONS

View file

@@ -2,13 +2,14 @@ from pathlib import Path
 from .fixtures import *

-def test_oneshot_command_exists(tmp_path):
+def test_oneshot_command_exists(tmp_path, disable_extractors_dict):
     os.chdir(tmp_path)
-    process = subprocess.run(['archivebox', 'oneshot'], capture_output=True)
+    process = subprocess.run(['archivebox', 'oneshot'], capture_output=True, env=disable_extractors_dict)
     assert not "invalid choice: 'oneshot'" in process.stderr.decode("utf-8")

-def test_oneshot_commad_saves_page_in_right_folder(tmp_path):
-    process = subprocess.run(["archivebox", "oneshot", f"--out-dir={tmp_path}", "http://127.0.0.1:8080/static/example.com.html"], capture_output=True)
+def test_oneshot_commad_saves_page_in_right_folder(tmp_path, disable_extractors_dict):
+    process = subprocess.run(["archivebox", "oneshot", f"--out-dir={tmp_path}", "http://127.0.0.1:8080/static/example.com.html"],
+                             capture_output=True, env=disable_extractors_dict)
     items = ' '.join([str(x) for x in tmp_path.iterdir()])
     current_path = ' '.join([str(x) for x in Path.cwd().iterdir()])
     assert "index.json" in items

View file

@@ -1,8 +1,8 @@
 from .fixtures import *

-def test_remove_leaves_index_in_consistent_state(tmp_path, process):
+def test_remove_leaves_index_in_consistent_state(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
-    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
     remove_process = subprocess.run(['archivebox', 'remove', '127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True)
     list_process = subprocess.run(['archivebox', 'list'], capture_output=True)
     assert "Warning: SQL index does not match JSON index!" not in list_process.stderr.decode("utf-8")

View file

@@ -1,12 +1,13 @@
 from .fixtures import *

-def test_title_is_htmlencoded_in_index_html(tmp_path, process):
+def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractors_dict):
     """
     https://github.com/pirate/ArchiveBox/issues/330
     Unencoded content should not be rendered as it facilitates xss injections
     and breaks the layout.
     """
-    add_process = subprocess.run(['archivebox', 'add', 'http://localhost:8080/static/title_with_html.com.html'], capture_output=True)
+    add_process = subprocess.run(['archivebox', 'add', 'http://localhost:8080/static/title_with_html.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
     with open(tmp_path / "index.html", "r") as f:
         output_html = f.read()