Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-24 11:47:04 -04:00)

Merge pull request #403 from cdvv7788/single-file

Commit: c8e3aed647

16 changed files with 236 additions and 47 deletions
.github/workflows/test.yml (vendored): 24 lines changed

@@ -40,6 +40,22 @@ jobs:
         with:
           fetch-depth: 1
 
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 1
+          repository: "gildas-lormeau/SingleFile"
+          ref: "master"
+          path: "singlefile"
+
+      - name: Install npm requirements for singlefile
+        run: npm install --prefix singlefile/cli
+
+      - name: Give singlefile execution permissions
+        run: chmod +x singlefile/cli/single-file
+
+      - name: Set SINGLEFILE_BINARY
+        run: echo "::set-env name=SINGLEFILE_BINARY::$GITHUB_WORKSPACE/singlefile/cli/single-file"
+
       - name: Set up Python ${{ matrix.python }}
         uses: actions/setup-python@v1
         with:
@@ -60,6 +76,14 @@ jobs:
           restore-keys: |
             ${{ runner.os }}-${{ matrix.python }}-venv-
 
+      - name: Use nodejs 14.7.0
+        uses: actions/setup-node@v1
+        with:
+          node-version: 14.7.0
+
+      - name: Debug
+        run: ls ./
+
       - name: Install dependencies
         run: |
           python -m pip install .
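Aside: the last added step exports SINGLEFILE_BINARY via the `::set-env` workflow command, which GitHub Actions later deprecated in favor of appending to the file named by $GITHUB_ENV. A sketch of the replacement mechanism, written in Python (the project's language) with a hypothetical helper that is not part of this PR:

    # Hypothetical sketch: the modern replacement for the deprecated ::set-env
    # command is appending "NAME=value" lines to the file GitHub exposes via
    # the $GITHUB_ENV environment variable.
    import os

    def export_github_env(name: str, value: str) -> None:
        """Make `name` visible to all subsequent steps of the same job."""
        with open(os.environ["GITHUB_ENV"], "a") as f:
            f.write(f"{name}={value}\n")

    export_github_env(
        "SINGLEFILE_BINARY",
        os.path.join(os.environ["GITHUB_WORKSPACE"], "singlefile/cli/single-file"),
    )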
Dockerfile: 19 lines changed

@@ -22,9 +22,10 @@ ENV TZ=UTC \
     APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \
     CODE_PATH=/app \
     VENV_PATH=/venv \
-    DATA_PATH=/data
+    DATA_PATH=/data \
+    EXTRA_PATH=/extra
 
-# First install CLI utils and base deps, then Chrome + Fons
+# First install CLI utils and base deps, then Chrome + Fons + nodejs
 RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections \
     && apt-get update -qq \
     && apt-get install -qq -y --no-install-recommends \
@@ -32,6 +33,7 @@ RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selectio
         dumb-init jq git wget curl youtube-dl ffmpeg \
     && curl -sSL "https://dl.google.com/linux/linux_signing_key.pub" | apt-key add - \
     && echo "deb https://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \
+    && curl -sL https://deb.nodesource.com/setup_14.x | bash - \
     && apt-get update -qq \
     && apt-get install -qq -y --no-install-recommends \
         google-chrome-stable \
@@ -43,8 +45,18 @@ RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selectio
         fonts-symbola \
         fonts-noto \
         fonts-freefont-ttf \
+        nodejs \
+        unzip \
     && rm -rf /var/lib/apt/lists/*
 
+# Clone singlefile and move it to the /bin folder so archivebox can find it
+
+WORKDIR "$EXTRA_PATH"
+RUN wget -qO - https://github.com/gildas-lormeau/SingleFile/archive/master.zip > SingleFile.zip \
+    && unzip -q SingleFile.zip \
+    && npm install --prefix SingleFile-master/cli --production > /dev/null 2>&1 \
+    && chmod +x SingleFile-master/cli/single-file
+
 # Run everything from here on out as non-privileged user
 RUN groupadd --system archivebox \
     && useradd --system --create-home --gid archivebox --groups audio,video archivebox
@@ -60,7 +72,8 @@ VOLUME "$DATA_PATH"
 WORKDIR "$DATA_PATH"
 EXPOSE 8000
 ENV CHROME_BINARY=google-chrome \
-    CHROME_SANDBOX=False
+    CHROME_SANDBOX=False \
+    SINGLEFILE_BINARY="$EXTRA_PATH/SingleFile-master/cli/single-file"
 
 RUN env ALLOW_ROOT=True archivebox version
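The final `RUN env ALLOW_ROOT=True archivebox version` doubles as a build-time smoke test that the new dependency resolves. A standalone sketch of the same property the image relies on (hypothetical helper; the PR's own config derives SINGLEFILE_VERSION by running the binary with --version via bin_version()):

    # Hypothetical smoke test: confirm the single-file CLI installed under
    # $EXTRA_PATH is executable and answers --version, the same property
    # archivebox's bin_version() check depends on.
    import os
    import subprocess

    def singlefile_is_usable(binary: str) -> bool:
        try:
            result = subprocess.run([binary, '--version'],
                                    capture_output=True, timeout=30)
            return result.returncode == 0
        except (FileNotFoundError, subprocess.TimeoutExpired):
            return False

    if __name__ == '__main__':
        binary = os.environ.get('SINGLEFILE_BINARY', 'single-file')
        print(f'{binary} usable: {singlefile_is_usable(binary)}')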
archivebox/config.py (header lost in mirror)

@@ -74,6 +74,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)},
         'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)},
         'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
+        'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)},
         'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
         'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
         'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
@@ -104,6 +105,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
     'DEPENDENCY_CONFIG': {
         'USE_CURL': {'type': bool, 'default': True},
         'USE_WGET': {'type': bool, 'default': True},
+        'USE_SINGLEFILE': {'type': bool, 'default': True},
         'USE_GIT': {'type': bool, 'default': True},
         'USE_CHROME': {'type': bool, 'default': True},
         'USE_YOUTUBEDL': {'type': bool, 'default': True},
@@ -111,6 +113,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'CURL_BINARY': {'type': str, 'default': 'curl'},
         'GIT_BINARY': {'type': str, 'default': 'git'},
         'WGET_BINARY': {'type': str, 'default': 'wget'},
+        'SINGLEFILE_BINARY': {'type': str, 'default': 'single-file'},
         'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
         'CHROME_BINARY': {'type': str, 'default': None},
     },
@@ -249,6 +252,10 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
     'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
     'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
 
+    'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and (c['SAVE_SINGLEFILE'])},
+    'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
+    'SAVE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
+
     'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
     'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
     'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
@@ -674,6 +681,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
         'enabled': config['USE_WGET'],
         'is_valid': bool(config['WGET_VERSION']),
     },
+    'SINGLEFILE_BINARY': {
+        'path': bin_path(config['SINGLEFILE_BINARY']),
+        'version': config['SINGLEFILE_VERSION'],
+        'hash': bin_hash(config['SINGLEFILE_BINARY']),
+        'enabled': config['USE_SINGLEFILE'],
+        'is_valid': bool(config['SINGLEFILE_VERSION']),
+    },
     'GIT_BINARY': {
         'path': bin_path(config['GIT_BINARY']),
         'version': config['GIT_VERSION'],
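The DERIVED_CONFIG_DEFAULTS entries are lambdas evaluated in declaration order against the config built so far, so disabling either USE_SINGLEFILE or SAVE_SINGLEFILE short-circuits the version probe of the binary. A toy model of that evaluation order (a simplified stand-in, not archivebox's actual loader):

    # Toy model of how DERIVED_CONFIG_DEFAULTS resolves: each derived key sees
    # the values computed before it, so USE_SINGLEFILE folds in SAVE_SINGLEFILE
    # before SINGLEFILE_VERSION decides whether to probe the binary at all.
    config = {'USE_SINGLEFILE': True, 'SAVE_SINGLEFILE': False,
              'SINGLEFILE_BINARY': 'single-file'}

    derived = {
        'USE_SINGLEFILE': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE'],
        # stand-in for bin_version(): only runs when USE_SINGLEFILE survived
        'SINGLEFILE_VERSION': lambda c: ('probe ' + c['SINGLEFILE_BINARY'])
                                        if c['USE_SINGLEFILE'] else None,
        'SAVE_SINGLEFILE': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE'],
    }

    for key, default in derived.items():
        config[key] = default(config)

    print(config)
    # {'USE_SINGLEFILE': False, 'SAVE_SINGLEFILE': False,
    #  'SINGLEFILE_BINARY': 'single-file', 'SINGLEFILE_VERSION': None}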
archivebox/core/admin.py (header lost in mirror)

@@ -106,6 +106,7 @@ class SnapshotAdmin(admin.ModelAdmin):
             '<a href="/{}/{}" class="exists-{}" title="Screenshot">🖥 </a> '
             '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
             '<a href="/{}/{}/" class="exists-{}" title="WARC">🆆 </a> '
+            '<a href="/{}/{}" class="exists-{}" title="SingleFile">🗜 </a>'
             '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
             '<a href="/{}/{}/" class="exists-{}" title="Git repos">📦 </a> '
             '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
@@ -115,6 +116,7 @@ class SnapshotAdmin(admin.ModelAdmin):
             *link_tuple(link, 'screenshot_path'),
             *link_tuple(link, 'dom_path'),
             *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
+            *link_tuple(link, 'singlefile_path'),
             *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
             *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
             canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
archivebox/extractors/__init__.py (header lost in mirror)

@@ -25,6 +25,7 @@ from ..logging_util import (
 from .title import should_save_title, save_title
 from .favicon import should_save_favicon, save_favicon
 from .wget import should_save_wget, save_wget
+from .singlefile import should_save_singlefile, save_singlefile
 from .pdf import should_save_pdf, save_pdf
 from .screenshot import should_save_screenshot, save_screenshot
 from .dom import should_save_dom, save_dom
@@ -37,6 +38,7 @@ def get_default_archive_methods():
         ('title', should_save_title, save_title),
         ('favicon', should_save_favicon, save_favicon),
         ('wget', should_save_wget, save_wget),
+        ('singlefile', should_save_singlefile, save_singlefile),
         ('pdf', should_save_pdf, save_pdf),
         ('screenshot', should_save_screenshot, save_screenshot),
         ('dom', should_save_dom, save_dom),
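Each extractor registers as a (name, should_save, save) triple, and the archive loop consults should_save before calling save; that is why singlefile only needs these two hooks to participate. A simplified sketch of that dispatch (illustrative only; the real loop in archive_link() also handles logging, ignore_methods, and error accounting):

    # Simplified dispatch over extractor triples, in the spirit of
    # archive_link(): a method is skipped unless its should_save() predicate
    # approves, so an existing singlefile.html is never re-fetched.
    def run_archive_methods(link, out_dir, methods):
        results = {}
        for name, should_save, save in methods:
            if should_save(link, out_dir=out_dir):
                results[name] = save(link, out_dir=out_dir)
            else:
                results[name] = 'skipped'
        return results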
archivebox/extractors/singlefile.py: new file, 84 lines

@@ -0,0 +1,84 @@
+__package__ = 'archivebox.extractors'
+
+from pathlib import Path
+
+from typing import Optional
+import json
+
+from ..index.schema import Link, ArchiveResult, ArchiveError
+from ..system import run, chmod_file
+from ..util import (
+    enforce_types,
+    chrome_args
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_SINGLEFILE,
+    SINGLEFILE_BINARY,
+    SINGLEFILE_VERSION,
+    CHROME_BINARY,
+)
+from ..logging_util import TimedProgress
+
+
+@enforce_types
+def should_save_singlefile(link: Link, out_dir: Optional[str]=None) -> bool:
+    out_dir = out_dir or link.link_dir
+
+    output = Path(out_dir or link.link_dir) / 'singlefile.html'
+    return SAVE_SINGLEFILE and (not output.exists())
+
+
+@enforce_types
+def save_singlefile(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """download full site using single-file"""
+
+    out_dir = out_dir or link.link_dir
+    output = str(Path(out_dir).absolute() / "singlefile.html")
+
+    browser_args = chrome_args(TIMEOUT=0)
+
+    # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
+    cmd = [
+        SINGLEFILE_BINARY,
+        '--browser-executable-path={}'.format(CHROME_BINARY),
+        '--browser-args="{}"'.format(json.dumps(browser_args[1:])),
+        link.url,
+        output
+    ]
+
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix=' ')
+    try:
+        result = run(cmd, cwd=out_dir, timeout=timeout)
+
+        # parse out number of files downloaded from last line of stderr:
+        #  "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
+        output_tail = [
+            line.strip()
+            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
+            if line.strip()
+        ]
+        hints = (
+            'Got single-file response code: {}.'.format(result.returncode),
+            *output_tail,
+        )
+
+        # Check for common failure cases
+        if (result.returncode > 0):
+            raise ArchiveError('SingleFile was not able to archive the page', hints)
+        chmod_file(output)
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=SINGLEFILE_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )
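Worth noting how the command line comes together: chrome_args() returns the Chrome binary followed by its flags, so browser_args[1:] drops the binary and the remaining flags are handed to single-file as one JSON-encoded --browser-args value. A quick illustration with assumed sample values (not taken from a real run):

    # Illustrative: what save_singlefile's cmd looks like for a sample URL,
    # assuming chrome_args() returned the binary plus two flags.
    import json

    SINGLEFILE_BINARY = '/extra/SingleFile-master/cli/single-file'  # assumed path
    CHROME_BINARY = 'google-chrome'                                 # assumed
    browser_args = ['google-chrome', '--headless', '--no-sandbox']  # sample output

    cmd = [
        SINGLEFILE_BINARY,
        '--browser-executable-path={}'.format(CHROME_BINARY),
        '--browser-args="{}"'.format(json.dumps(browser_args[1:])),
        'https://example.com',
        '/data/archive/1594000000/singlefile.html',  # hypothetical snapshot dir
    ]
    print(cmd[2])
    # --browser-args="["--headless", "--no-sandbox"]"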
archivebox/index/schema.py (header lost in mirror)

@@ -365,6 +365,7 @@ class Link:
             'screenshot.png',
             'output.html',
             'media',
+            'singlefile.html'
         )
 
         return any(
@@ -376,7 +377,7 @@ class Link:
         """get the latest output that each archive method produced for link"""
 
         ARCHIVE_METHODS = (
-            'title', 'favicon', 'wget', 'warc', 'pdf',
+            'title', 'favicon', 'wget', 'warc', 'singlefile', 'pdf',
             'screenshot', 'dom', 'git', 'media', 'archive_org',
         )
         latest: Dict[str, ArchiveOutput] = {}
@@ -392,7 +393,6 @@ class Link:
                 latest[archive_method] = history[0].output
             else:
                 latest[archive_method] = None
-
         return latest
 
 
@@ -406,6 +406,7 @@ class Link:
             'google_favicon_path': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
             'wget_path': wget_output_path(self),
             'warc_path': 'warc',
+            'singlefile_path': 'singlefile.html',
             'pdf_path': 'output.pdf',
             'screenshot_path': 'screenshot.png',
             'dom_path': 'output.html',
@@ -425,7 +426,7 @@ class Link:
                 'pdf_path': static_path,
                 'screenshot_path': static_path,
                 'dom_path': static_path,
+                'singlefile_path': static_path,
             })
         return canonical
 
-
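With singlefile_path registered in canonical_outputs(), downstream consumers (the admin columns above and the details template below) can resolve a snapshot's SingleFile artifact uniformly instead of hard-coding the filename. A hedged sketch of that lookup (hypothetical helper, simplified Link usage):

    # Hypothetical consumer of canonical_outputs(): resolve and check the
    # SingleFile artifact for one snapshot directory.
    from pathlib import Path
    from typing import Optional

    def singlefile_output(link, out_dir: Path) -> Optional[Path]:
        canon = link.canonical_outputs()  # now includes 'singlefile_path'
        candidate = out_dir / canon['singlefile_path']
        return candidate if candidate.exists() else None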
archivebox/logging_util.py (header lost in mirror)

@@ -518,6 +518,7 @@ def printable_folder_status(name: str, folder: Dict) -> str:
 
 @enforce_types
 def printable_dependency_version(name: str, dependency: Dict) -> str:
+    version = None
     if dependency['enabled']:
         if dependency['is_valid']:
             color, symbol, note, version = 'green', '√', 'valid', ''
link_details.html template (header lost in mirror)

@@ -79,6 +79,7 @@
     .card {
         overflow: hidden;
         box-shadow: 2px 3px 14px 0px rgba(0,0,0,0.02);
+        margin-top: 10px;
     }
     .card h4 {
         font-size: 1.4vw;
@@ -335,6 +336,18 @@
                 </div>
             </div>
         </div>
+        <div class="col-lg-2">
+            <div class="card">
+                <iframe class="card-img-top" src="$singlefile_path" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                <div class="card-body">
+                    <a href="$singlefile_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
+                        <img src="../../static/external.png" class="external"/>
+                    </a>
+                    <a href="$singlefile_path" target="preview"><h4 class="card-title">SingleFile</h4></a>
+                    <p class="card-text">archive/singlefile.html</p>
+                </div>
+            </div>
+        </div>
         <div class="col-lg-2">
             <div class="card">
                 <iframe class="card-img-top pdf-frame" src="$pdf_path" scrolling="no"></iframe>
tests/fixtures.py (header lost in mirror)

@@ -8,3 +8,18 @@ def process(tmp_path):
     os.chdir(tmp_path)
     process = subprocess.run(['archivebox', 'init'], capture_output=True)
     return process
+
+@pytest.fixture
+def disable_extractors_dict():
+    env = os.environ.copy()
+    env.update({
+        "USE_WGET": "false",
+        "USE_SINGLEFILE": "false",
+        "SAVE_PDF": "false",
+        "SAVE_SCREENSHOT": "false",
+        "SAVE_DOM": "false",
+        "USE_GIT": "false",
+        "SAVE_MEDIA": "false",
+        "SAVE_ARCHIVE_DOT_ORG": "false"
+    })
+    return env
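One detail worth keeping: the fixture starts from os.environ.copy() rather than a bare dict of overrides, because env= replaces the child process's entire environment. An illustrative sketch of why (not part of the PR):

    # Why the fixture copies os.environ: passing env= replaces the child's
    # whole environment, so overrides must be layered onto a full copy or the
    # child loses PATH, HOME, etc. and can't even find the archivebox binary.
    import os
    import subprocess

    env = os.environ.copy()
    env.update({"USE_SINGLEFILE": "false"})             # targeted override
    subprocess.run(["archivebox", "version"], env=env)  # still resolves via PATH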
tests/test_args.py (header lost in mirror; inferred from contents)

@@ -3,25 +3,30 @@ import json
 
 from .fixtures import *
 
-def test_depth_flag_is_accepted(process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], capture_output=True)
+def test_depth_flag_is_accepted(process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"],
+                                 capture_output=True, env=disable_extractors_dict)
     assert 'unrecognized arguments: --depth' not in arg_process.stderr.decode("utf-8")
 
-def test_depth_flag_fails_if_it_is_not_0_or_1(process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=5"], capture_output=True)
+def test_depth_flag_fails_if_it_is_not_0_or_1(process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=5"],
+                                 capture_output=True, env=disable_extractors_dict)
     assert 'invalid choice' in arg_process.stderr.decode("utf-8")
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=-1"], capture_output=True)
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=-1"],
+                                 capture_output=True, env=disable_extractors_dict)
     assert 'invalid choice' in arg_process.stderr.decode("utf-8")
 
-def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], capture_output=True)
+def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"],
+                                 capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
     with open(archived_item_path / "index.json", "r") as f:
         output_json = json.load(f)
     assert output_json["base_url"] == "127.0.0.1:8080/static/example.com.html"
 
-def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=1"], capture_output=True)
+def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=1"],
+                                 capture_output=True, env=disable_extractors_dict)
     with open(tmp_path / "index.json", "r") as f:
         archive_file = f.read()
     assert "http://127.0.0.1:8080/static/example.com.html" in archive_file
tests/test_extractors.py (header lost in mirror; inferred from contents)

@@ -1,8 +1,10 @@
 from .fixtures import *
 from archivebox.extractors import ignore_methods, get_default_archive_methods, should_save_title
 
-def test_wget_broken_pipe(tmp_path, process):
-    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+def test_wget_broken_pipe(tmp_path, process, disable_extractors_dict):
+    disable_extractors_dict.update({"USE_WGET": "true"})
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
     assert "TypeError chmod_file(..., path: str) got unexpected NoneType argument path=None" not in add_process.stdout.decode("utf-8")
 
 def test_ignore_methods():
@@ -11,3 +13,11 @@ def test_ignore_methods():
     """
     ignored = ignore_methods(['title'])
     assert should_save_title not in ignored
+
+def test_singlefile_works(tmp_path, process, disable_extractors_dict):
+    disable_extractors_dict.update({"USE_SINGLEFILE": "true"})
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+    output_file = archived_item_path / "singlefile.html"
+    assert output_file.exists()
tests/test_init.py (header lost in mirror; inferred from test names)

@@ -18,9 +18,10 @@ def test_update(tmp_path, process):
     update_process = subprocess.run(['archivebox', 'init'], capture_output=True)
     assert "Updating existing ArchiveBox collection in this folder" in update_process.stdout.decode("utf-8")
 
-def test_add_link(tmp_path, process):
+def test_add_link(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
-    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
 
     assert "index.json" in [x.name for x in archived_item_path.iterdir()]
@@ -33,9 +34,10 @@ def test_add_link(tmp_path, process):
         output_html = f.read()
     assert "Example Domain" in output_html
 
-def test_add_link_support_stdin(tmp_path, process):
+def test_add_link_support_stdin(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
-    stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+                                     env=disable_extractors_dict)
     stdin_process.communicate(input="http://127.0.0.1:8080/static/example.com.html".encode())
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
 
@@ -51,9 +53,10 @@ def test_correct_permissions_output_folder(tmp_path, process):
         file_path = tmp_path / file
         assert oct(file_path.stat().st_mode)[-3:] == OUTPUT_PERMISSIONS
 
-def test_correct_permissions_add_command_results(tmp_path, process):
+def test_correct_permissions_add_command_results(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
-    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True,
+                                 env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
     for path in archived_item_path.iterdir():
         assert oct(path.stat().st_mode)[-3:] == OUTPUT_PERMISSIONS
tests/test_oneshot.py (header lost in mirror; inferred from contents)

@@ -2,13 +2,14 @@ from pathlib import Path
 
 from .fixtures import *
 
-def test_oneshot_command_exists(tmp_path):
+def test_oneshot_command_exists(tmp_path, disable_extractors_dict):
     os.chdir(tmp_path)
-    process = subprocess.run(['archivebox', 'oneshot'], capture_output=True)
+    process = subprocess.run(['archivebox', 'oneshot'], capture_output=True, env=disable_extractors_dict)
     assert not "invalid choice: 'oneshot'" in process.stderr.decode("utf-8")
 
-def test_oneshot_commad_saves_page_in_right_folder(tmp_path):
-    process = subprocess.run(["archivebox", "oneshot", f"--out-dir={tmp_path}", "http://127.0.0.1:8080/static/example.com.html"], capture_output=True)
+def test_oneshot_commad_saves_page_in_right_folder(tmp_path, disable_extractors_dict):
+    process = subprocess.run(["archivebox", "oneshot", f"--out-dir={tmp_path}", "http://127.0.0.1:8080/static/example.com.html"],
+                             capture_output=True, env=disable_extractors_dict)
     items = ' '.join([str(x) for x in tmp_path.iterdir()])
     current_path = ' '.join([str(x) for x in Path.cwd().iterdir()])
     assert "index.json" in items
tests/test_remove.py (header lost in mirror; inferred from contents)

@@ -1,8 +1,8 @@
 from .fixtures import *
 
-def test_remove_leaves_index_in_consistent_state(tmp_path, process):
+def test_remove_leaves_index_in_consistent_state(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
-    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
     remove_process = subprocess.run(['archivebox', 'remove', '127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True)
     list_process = subprocess.run(['archivebox', 'list'], capture_output=True)
     assert "Warning: SQL index does not match JSON index!" not in list_process.stderr.decode("utf-8")
tests/test_title.py (header lost in mirror; inferred from contents)

@@ -1,12 +1,13 @@
 from .fixtures import *
 
-def test_title_is_htmlencoded_in_index_html(tmp_path, process):
+def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractors_dict):
     """
     https://github.com/pirate/ArchiveBox/issues/330
     Unencoded content should not be rendered as it facilitates xss injections
     and breaks the layout.
     """
-    add_process = subprocess.run(['archivebox', 'add', 'http://localhost:8080/static/title_with_html.com.html'], capture_output=True)
+    add_process = subprocess.run(['archivebox', 'add', 'http://localhost:8080/static/title_with_html.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
 
     with open(tmp_path / "index.html", "r") as f:
         output_html = f.read()