From 275ad22db706fd44366cb88f6b1eef805c81c6d1 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 8 Dec 2020 18:42:01 -0500 Subject: [PATCH] refactor: Remove `skip_index` from archive related functions --- archivebox/cli/__init__.py | 8 +++++--- archivebox/extractors/__init__.py | 22 ++++++++++------------ archivebox/main.py | 2 +- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index 9cf6d0ac..3df41809 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -19,7 +19,7 @@ meta_cmds = ('help', 'version') main_cmds = ('init', 'info', 'config') archive_cmds = ('add', 'remove', 'update', 'list', 'status') -fake_db = ("oneshot",) + meta_cmds +fake_db = ("oneshot",) display_first = (*meta_cmds, *main_cmds, *archive_cmds) @@ -60,8 +60,10 @@ def run_subcommand(subcommand: str, stdin: Optional[IO]=None, pwd: Union[Path, str, None]=None) -> None: """Run a given ArchiveBox subcommand with the given list of args""" - from ..config import setup_django - setup_django(in_memory_db=subcommand in fake_db) + + if subcommand not in meta_cmds: + from ..config import setup_django + setup_django(in_memory_db=subcommand in fake_db) module = import_module('.archivebox_{}'.format(subcommand), __package__) module.main(args=subcommand_args, stdin=stdin, pwd=pwd) # type: ignore diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 6db89f2b..a4acef0b 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -67,16 +67,15 @@ def ignore_methods(to_ignore: List[str]): return list(methods) @enforce_types -def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, skip_index: bool=False) -> Link: +def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link: """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp""" # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach. - if not skip_index: - from core.models import Snapshot, ArchiveResult - try: - snapshot = Snapshot.objects.get(url=link.url) # TODO: This will be unnecessary once everything is a snapshot - except Snapshot.DoesNotExist: - snapshot = write_link_to_sql_index(link) + from core.models import Snapshot, ArchiveResult + try: + snapshot = Snapshot.objects.get(url=link.url) # TODO: This will be unnecessary once everything is a snapshot + except Snapshot.DoesNotExist: + snapshot = write_link_to_sql_index(link) ARCHIVE_METHODS = get_default_archive_methods() @@ -93,7 +92,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s os.makedirs(out_dir) link = load_link_details(link, out_dir=out_dir) - write_link_details(link, out_dir=out_dir, skip_sql_index=skip_index) + write_link_details(link, out_dir=out_dir, skip_sql_index=False) log_link_archiving_started(link, out_dir, is_new) link = link.overwrite(updated=datetime.now()) stats = {'skipped': 0, 'succeeded': 0, 'failed': 0} @@ -112,9 +111,8 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s stats[result.status] += 1 log_archive_method_finished(result) - if not skip_index: - write_search_index(link=link, texts=result.index_texts) - ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version, + write_search_index(link=link, texts=result.index_texts) + ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version, output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status) else: @@ -135,7 +133,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s except Exception: pass - write_link_details(link, out_dir=out_dir, skip_sql_index=skip_index) + write_link_details(link, out_dir=out_dir, skip_sql_index=False) log_link_archiving_finished(link, link.link_dir, is_new, stats) diff --git a/archivebox/main.py b/archivebox/main.py index 49c31eed..6463bab6 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -524,7 +524,7 @@ def oneshot(url: str, out_dir: Path=OUTPUT_DIR): ) raise SystemExit(2) methods = ignore_methods(['title']) - archive_link(oneshot_link[0], out_dir=out_dir, methods=methods, skip_index=False) + archive_link(oneshot_link[0], out_dir=out_dir, methods=methods) return oneshot_link @enforce_types