Add URL-specific method allow/deny lists

Allows enabling only allow-listed extractors or disabling specific deny-listed extractors for a regular expression matched against an added site's URL.
2025-05-14 15:14:31 -04:00 · 2023-07-31 11:34:03 -04:00 · 2023-07-31 11:34:03 -04:00 · b44f7e68b1
commit b44f7e68b1
parent 46e80dd509
3 changed files with 85 additions and 16 deletions
--- a/tests/test_extractors.py
+++ b/tests/test_extractors.py
@ -13,12 +13,51 @@ def test_ignore_methods():
    Takes the passed method out of the default methods list and returns that value
    """
    ignored = ignore_methods(['title'])
-    assert should_save_title not in ignored
+    assert "title" not in ignored
+
+def test_save_allowdenylist_works(tmp_path, process, disable_extractors_dict):
+    allow_list = {
+        r'/static': ["headers", "singlefile"],
+        r'example\.com\.html$': ["headers"],
+    }
+    deny_list = {
+        "/static": ["singlefile"],
+    }
+    disable_extractors_dict.update({
+        "SAVE_HEADERS": "true",
+        "USE_SINGLEFILE": "true",
+        "SAVE_ALLOWLIST": pyjson.dumps(allow_list),
+        "SAVE_DENYLIST": pyjson.dumps(deny_list),
+    })
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                  capture_output=True, env=disable_extractors_dict) 
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+    singlefile_file = archived_item_path / "singlefile.html"
+    assert not singlefile_file.exists()
+    headers_file = archived_item_path / "headers.json"
+    assert headers_file.exists()
+
+def test_save_denylist_works(tmp_path, process, disable_extractors_dict):
+    deny_list = {
+        "/static": ["singlefile"],
+    }
+    disable_extractors_dict.update({
+        "SAVE_HEADERS": "true",
+        "USE_SINGLEFILE": "true",
+        "SAVE_DENYLIST": pyjson.dumps(deny_list),
+    })
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                  capture_output=True, env=disable_extractors_dict) 
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+    singlefile_file = archived_item_path / "singlefile.html"
+    assert not singlefile_file.exists()
+    headers_file = archived_item_path / "headers.json"
+    assert headers_file.exists()

 def test_singlefile_works(tmp_path, process, disable_extractors_dict):
    disable_extractors_dict.update({"USE_SINGLEFILE": "true"})
    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
-                                  capture_output=True, env=disable_extractors_dict) 
+                                  capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
    output_file = archived_item_path / "singlefile.html" 
    assert output_file.exists()