mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-14 15:14:31 -04:00
Add URL-specific method allow/deny lists
Allows enabling only allow-listed extractors or disabling specific deny-listed extractors for a regular expression matched against an added site's URL.
This commit is contained in:
parent
46e80dd509
commit
b44f7e68b1
3 changed files with 85 additions and 16 deletions
|
@ -13,12 +13,51 @@ def test_ignore_methods():
|
|||
Takes the passed method out of the default methods list and returns that value
|
||||
"""
|
||||
ignored = ignore_methods(['title'])
|
||||
assert should_save_title not in ignored
|
||||
assert "title" not in ignored
|
||||
|
||||
def test_save_allowdenylist_works(tmp_path, process, disable_extractors_dict):
    """An allow-listed extractor runs while a deny-listed one is skipped for a matching URL.

    The allow/deny lists map URL regexes to extractor names; both are passed
    to the archivebox CLI as JSON via environment variables.
    """
    # URL-pattern -> extractor-name maps exercised by this test.
    allow_list = {
        r'/static': ["headers", "singlefile"],
        r'example\.com\.html$': ["headers"],
    }
    deny_list = {
        "/static": ["singlefile"],
    }

    # Enable both extractors, then constrain them via the allow/deny lists.
    env_overrides = {
        "SAVE_HEADERS": "true",
        "USE_SINGLEFILE": "true",
        "SAVE_ALLOWLIST": pyjson.dumps(allow_list),
        "SAVE_DENYLIST": pyjson.dumps(deny_list),
    }
    disable_extractors_dict.update(env_overrides)

    add_process = subprocess.run(
        ['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]

    # The deny list should have suppressed singlefile despite it being allowed.
    assert not (archived_item_path / "singlefile.html").exists()
    # Headers remain allow-listed and undenied, so their output must exist.
    assert (archived_item_path / "headers.json").exists()
|
||||
|
||||
def test_save_denylist_works(tmp_path, process, disable_extractors_dict):
    """A deny list alone disables the named extractor for matching URLs.

    With no allow list configured, all enabled extractors run except those
    deny-listed for the URL pattern.
    """
    # Suppress singlefile for any URL containing "/static".
    deny_list = {
        "/static": ["singlefile"],
    }

    disable_extractors_dict.update({
        "SAVE_HEADERS": "true",
        "USE_SINGLEFILE": "true",
        "SAVE_DENYLIST": pyjson.dumps(deny_list),
    })

    add_process = subprocess.run(
        ['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]

    # singlefile is enabled but deny-listed for this URL -> no output file.
    assert not (archived_item_path / "singlefile.html").exists()
    # headers is enabled and not deny-listed -> output must be present.
    assert (archived_item_path / "headers.json").exists()
|
||||
|
||||
def test_singlefile_works(tmp_path, process, disable_extractors_dict):
    """Archiving a page with USE_SINGLEFILE enabled produces singlefile.html.

    Bug fix: the original block repeated the
    `capture_output=True, env=disable_extractors_dict)` continuation line,
    leaving a stray dangling line that is a syntax error. The duplicate is
    removed here.
    """
    disable_extractors_dict.update({"USE_SINGLEFILE": "true"})

    add_process = subprocess.run(
        ['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]

    # The singlefile extractor should have written its output file.
    output_file = archived_item_path / "singlefile.html"
    assert output_file.exists()
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue