From 44eede96e5b75d32758c68a11370ff2a0857b103 Mon Sep 17 00:00:00 2001
From: Cristian
Date: Fri, 13 Nov 2020 09:24:34 -0500
Subject: [PATCH] feat: Add extract flag to add command

Add an `--extract` option to `archivebox add` that accepts one or more
extractor names and forwards them to `add()` as `extractors`.

When `extractors` is non-empty, `add()` passes it to `archive_links()` as
the `methods` keyword; otherwise `archive_links()` is called with the same
arguments as before. `extractors` defaults to None rather than a shared
list object.

A new test runs `archivebox add ... --extract wget` and checks that the
snapshot directory contains `warc` but not `singlefile.html`.
---
 archivebox/cli/archivebox_add.py |  9 ++++++++-
 archivebox/main.py               | 14 ++++++++++----
 tests/test_add.py                | 11 ++++++++++-
 3 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py
index 5c370fa5..8d2d2af2 100644
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -62,10 +62,16 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         help="Re-archive URLs from scratch, overwriting any existing files"
     )
     parser.add_argument(
-        '--init', #'-i',
+        "--init", #'-i',
         action='store_true',
         help="Init/upgrade the curent data directory before adding",
     )
+    parser.add_argument(
+        "--extract",
+        nargs="+",
+        help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. "
+             "This does not take precedence over the configuration",
+    )
     command = parser.parse_args(args or ())
     urls = command.urls
     stdin_urls = accept_stdin(stdin)
@@ -83,6 +89,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         overwrite=command.overwrite,
         init=command.init,
         out_dir=pwd or OUTPUT_DIR,
+        extractors=command.extract or [],
     )
 
 
diff --git a/archivebox/main.py b/archivebox/main.py
index 44ee6b14..208f7661 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -525,7 +525,8 @@ def add(urls: Union[str, List[str]],
         index_only: bool=False,
         overwrite: bool=False,
         init: bool=False,
-        out_dir: Path=OUTPUT_DIR) -> List[Link]:
+        out_dir: Path=OUTPUT_DIR,
+        extractors: Union[List[str], None]=None) -> List[Link]:
     """Add a new URL or list of URLs to your archive"""
 
     assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
@@ -567,12 +568,17 @@ def add(urls: Union[str, List[str]],
         return all_links
 
     # Run the archive methods for each link
+    archive_kwargs = {
+        "out_dir": out_dir,
+    }
+    if extractors:
+        archive_kwargs["methods"] = extractors
     if update_all:
-        archive_links(all_links, overwrite=overwrite, out_dir=out_dir)
+        archive_links(all_links, overwrite=overwrite, **archive_kwargs)
     elif overwrite:
-        archive_links(imported_links, overwrite=True, out_dir=out_dir)
+        archive_links(imported_links, overwrite=True, **archive_kwargs)
     elif new_links:
-        archive_links(new_links, overwrite=False, out_dir=out_dir)
+        archive_links(new_links, overwrite=False, **archive_kwargs)
 
     return all_links
 
diff --git a/tests/test_add.py b/tests/test_add.py
index 5e672e8d..bb15e51b 100644
--- a/tests/test_add.py
+++ b/tests/test_add.py
@@ -81,4 +81,13 @@ def test_add_updates_history_json_index(tmp_path, process, disable_extractors_di
     with open(archived_item_path / "index.json", "r") as f:
         output_json = json.load(f)
 
-    assert output_json["history"] != {}
\ No newline at end of file
+    assert output_json["history"] != {}
+
+def test_extract_input_uses_only_passed_extractors(tmp_path, process):
+    subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--extract", "wget"],
+                  capture_output=True)
+
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+
+    assert (archived_item_path / "warc").exists()
+    assert not (archived_item_path / "singlefile.html").exists()