From 9fa70b3452836cafb975cb0dbb37b52a74ab68eb Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 15:48:46 +0200 Subject: [PATCH] add extractors arg to oneshot command and bump version to v0.5.1 --- archivebox/cli/archivebox_add.py | 2 +- archivebox/cli/archivebox_oneshot.py | 8 ++++++++ archivebox/main.py | 9 +++++---- package.json | 2 +- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index b4e65231..41c7554d 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -89,8 +89,8 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional index_only=command.index_only, overwrite=command.overwrite, init=command.init, - out_dir=pwd or OUTPUT_DIR, extractors=command.extract, + out_dir=pwd or OUTPUT_DIR, ) diff --git a/archivebox/cli/archivebox_oneshot.py b/archivebox/cli/archivebox_oneshot.py index 2353d101..af68bac2 100644 --- a/archivebox/cli/archivebox_oneshot.py +++ b/archivebox/cli/archivebox_oneshot.py @@ -36,6 +36,13 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional ' ~/Desktop/sites_list.csv\n' ) ) + parser.add_argument( + "--extract", + type=str, + help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \ + This does not take precedence over the configuration", + default="" + ) parser.add_argument( '--out-dir', type=str, @@ -55,6 +62,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional oneshot( url=stdin_url or url, out_dir=Path(command.out_dir).resolve(), + extractors=command.extract, ) diff --git a/archivebox/main.py b/archivebox/main.py index 6463bab6..97c13c4e 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -511,7 +511,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None: @enforce_types -def oneshot(url: str, out_dir: Path=OUTPUT_DIR): +def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR): """ Create a single URL archive folder with an index.json and index.html, and all the archive method outputs. You can run this to archive single pages without needing to create a whole collection with archivebox init. @@ -523,7 +523,8 @@ def oneshot(url: str, out_dir: Path=OUTPUT_DIR): color='red' ) raise SystemExit(2) - methods = ignore_methods(['title']) + + methods = extractors.split(",") if extractors else ignore_methods(['title']) archive_link(oneshot_link[0], out_dir=out_dir, methods=methods) return oneshot_link @@ -534,8 +535,8 @@ def add(urls: Union[str, List[str]], index_only: bool=False, overwrite: bool=False, init: bool=False, - out_dir: Path=OUTPUT_DIR, - extractors: str="") -> List[Link]: + extractors: str="", + out_dir: Path=OUTPUT_DIR) -> List[Link]: """Add a new URL or list of URLs to your archive""" assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)' diff --git a/package.json b/package.json index 8d88a3fd..36545fb7 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "archivebox", - "version": "0.5.0", + "version": "0.5.1", "description": "ArchiveBox: The self-hosted internet archive", "author": "Nick Sweeting ", "license": "MIT",