From fa5de72f9f9b371d0604e7721a7431414819b574 Mon Sep 17 00:00:00 2001 From: Cristian Date: Sat, 28 Nov 2020 12:28:39 -0500 Subject: [PATCH] refactor: Move indexing logic out of logging module --- archivebox/index/__init__.py | 8 ++++---- archivebox/index/html.py | 7 +++++++ archivebox/index/json.py | 15 ++++++++++++++- archivebox/logging_util.py | 29 ----------------------------- archivebox/main.py | 18 +++++++++++++++--- 5 files changed, 40 insertions(+), 37 deletions(-) diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index 9e460dc7..97eeb6a2 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -383,7 +383,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: """indexed links without checking archive status or data directory validity""" - links = [snapshot.as_link() for snapshot in snapshots.iterator()] + links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()] return { link.link_dir: link for link in links @@ -391,7 +391,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: """indexed links that are archived with a valid data directory""" - links = [snapshot.as_link() for snapshot in snapshots.iterator()] + links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()] return { link.link_dir: link for link in filter(is_archived, links) @@ -399,7 +399,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: """indexed links that are unarchived with no data directory or an empty data directory""" - links = [snapshot.as_link() for snapshot in snapshots.iterator()] + links = [snapshot.as_link_with_details() for 
snapshot in snapshots.iterator()] return { link.link_dir: link for link in filter(is_unarchived, links) @@ -424,7 +424,7 @@ def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: """dirs with a valid index matched to the main index and archived content""" - links = [snapshot.as_link() for snapshot in snapshots.iterator()] + links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()] return { link.link_dir: link for link in filter(is_valid, links) diff --git a/archivebox/index/html.py b/archivebox/index/html.py index b793cea3..207f2fd3 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -50,6 +50,13 @@ def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]: return () +def generate_index_from_links(links: List[Link], with_headers: bool): + if with_headers: + output = main_index_template(links) + else: + output = main_index_template(links, MINIMAL_INDEX_TEMPLATE) + return output + @enforce_types def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> str: """render the template for the entire main index""" diff --git a/archivebox/index/json.py b/archivebox/index/json.py index 1c3ce6e8..6422a93e 100644 --- a/archivebox/index/json.py +++ b/archivebox/index/json.py @@ -39,7 +39,20 @@ MAIN_INDEX_HEADER = { }, } -### Main Links Index +@enforce_types +def generate_json_index_from_links(links: List[Link], with_headers: bool): + if with_headers: + output = { + **MAIN_INDEX_HEADER, + 'num_links': len(links), + 'updated': datetime.now(), + 'last_run_cmd': sys.argv, + 'links': links, + } + else: + output = links + return to_json(output, indent=4, sort_keys=True) + @enforce_types def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]: diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index 8648e0a4..5581efdb 100644 ---
a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -477,39 +477,10 @@ def printable_filesize(num_bytes: Union[int, float]) -> str: @enforce_types def printable_folders(folders: Dict[str, Optional["Link"]], - json: bool=False, - html: bool=False, - csv: Optional[str]=None, with_headers: bool=False) -> str: - from .index.json import MAIN_INDEX_HEADER links = folders.values() - if json: - from .index.json import to_json - if with_headers: - output = { - **MAIN_INDEX_HEADER, - 'num_links': len(links), - 'updated': datetime.now(), - 'last_run_cmd': sys.argv, - 'links': links, - } - else: - output = links - return to_json(output, indent=4, sort_keys=True) - elif html: - from .index.html import main_index_template - if with_headers: - output = main_index_template(links) - else: - from .index.html import MINIMAL_INDEX_TEMPLATE - output = main_index_template(links, template=MINIMAL_INDEX_TEMPLATE) - return output - elif csv: - from .index.csv import links_to_csv - return links_to_csv(folders.values(), cols=csv.split(','), header=with_headers) - return '\n'.join( f'{folder} {link and link.url} "{link and link.title}"' for folder, link in folders.items() diff --git a/archivebox/main.py b/archivebox/main.py index 94658a8f..db589841 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -49,12 +49,16 @@ from .index import ( from .index.json import ( parse_json_main_index, parse_json_links_details, + generate_json_index_from_links, ) from .index.sql import ( get_admins, apply_migrations, remove_from_sql_main_index, ) +from .index.html import ( + generate_index_from_links, +) from .extractors import archive_links, archive_link, ignore_methods from .config import ( stderr, @@ -745,7 +749,6 @@ def list_all(filter_patterns_str: Optional[str]=None, elif filter_patterns_str: filter_patterns = filter_patterns_str.split('\n') - snapshots = list_links( filter_patterns=filter_patterns, filter_type=filter_type, @@ -761,8 +764,17 @@ def list_all(filter_patterns_str: 
Optional[str]=None, status=status, out_dir=out_dir, ) - - print(printable_folders(folders, json=json, csv=csv, html=html, with_headers=with_headers)) + + if json: + output = generate_json_index_from_links(folders.values(), with_headers) + elif html: + output = generate_index_from_links(folders.values(), with_headers) + elif csv: + from .index.csv import links_to_csv + output = links_to_csv(folders.values(), cols=csv.split(','), header=with_headers) + else: + output = printable_folders(folders, with_headers=with_headers) + print(output) return folders