From b8585dd92e3ffcb11d32e099f8e796fd90a5f22d Mon Sep 17 00:00:00 2001
From: Cristian
Date: Thu, 20 Aug 2020 09:18:25 -0500
Subject: [PATCH] feat: load_main_index returns a queryset now

---
 archivebox/index/__init__.py | 10 ++++------
 archivebox/main.py           | 31 ++++++++++---------------------
 2 files changed, 14 insertions(+), 27 deletions(-)

diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 022c83b4..9c78053b 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -17,6 +17,7 @@ from ..util import (
     ExtendedEncoder,
 )
 from ..config import (
+    setup_django,
     ARCHIVE_DIR_NAME,
     SQL_INDEX_FILENAME,
     JSON_INDEX_FILENAME,
@@ -263,17 +264,14 @@ def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
 
 @enforce_types
 def load_main_index(out_dir: str=OUTPUT_DIR, warn: bool=True) -> List[Link]:
     """parse and load existing index with any new links from import_path merged in"""
-
-    all_links: List[Link] = []
+    setup_django(out_dir, check_db=True)
+    from core.models import Snapshot
     try:
-        all_links = list(parse_sql_main_index(out_dir))
-        list(parse_sql_main_index(out_dir))
+        return Snapshot.objects.all()
     except (KeyboardInterrupt, SystemExit):
         raise SystemExit(0)
 
-    return all_links
-
 @enforce_types
 def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]:
     index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
diff --git a/archivebox/main.py b/archivebox/main.py
index 5b699af0..aa6b97de 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -322,7 +322,7 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
     if existing_index:
         all_links = {
             link.url: link
-            for link in load_main_index(out_dir=out_dir, warn=False)
+            for link in [x.as_link for x in load_main_index(out_dir=out_dir, warn=False)]
         }
         print('    √ Loaded {} links from existing main index.'.format(len(all_links)))
 
@@ -402,21 +402,11 @@ def status(out_dir: str=OUTPUT_DIR) -> None:
     print(f'    Index size: {size} across {num_files} files')
     print()
 
-    links = list(load_main_index(out_dir=out_dir))
-    num_sql_links = len(links)
-    num_json_links = sum(1 for link in parse_json_main_index(out_dir=out_dir))
-    num_html_links = sum(1 for url in parse_html_main_index(out_dir=out_dir))
+    links = load_main_index(out_dir=out_dir)
+    num_sql_links = links.count()
     num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
     print(f'    > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
-    print(f'    > JSON Index: {num_json_links} links'.ljust(36), f'(found in {JSON_INDEX_FILENAME})')
-    print(f'    > HTML Index: {num_html_links} links'.ljust(36), f'(found in {HTML_INDEX_FILENAME})')
     print(f'    > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
-
-    if num_html_links != len(links) or num_json_links != len(links):
-        print()
-        print('    {lightred}Hint:{reset} You can fix index count differences automatically by running:'.format(**ANSI))
-        print('        archivebox init')
-        print()
 
     print('{green}[*] Scanning archive data directories...{reset}'.format(**ANSI))
     print(ANSI['lightyellow'], f'   {ARCHIVE_DIR}/*', ANSI['reset'])
@@ -479,7 +469,7 @@ def status(out_dir: str=OUTPUT_DIR) -> None:
         print('        archivebox manage createsuperuser')
     print()
 
-    for snapshot in Snapshot.objects.order_by('-updated')[:10]:
+    for snapshot in links.order_by('-updated')[:10]:
         if not snapshot.updated:
             continue
         print(
@@ -529,9 +519,8 @@ def add(urls: Union[str, List[str]],
     # Load list of links from the existing index
     check_data_folder(out_dir=out_dir)
     check_dependencies()
-    all_links: List[Link] = []
     new_links: List[Link] = []
-    all_links = load_main_index(out_dir=out_dir)
+    all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
 
     log_importing_started(urls=urls, depth=depth, index_only=index_only)
     if isinstance(urls, str):
@@ -570,7 +559,7 @@ def add(urls: Union[str, List[str]],
         return all_links
 
     # Step 4: Re-write links index with updated titles, icons, and resources
-    all_links = load_main_index(out_dir=out_dir)
+    all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
     write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
     return all_links
 
@@ -635,7 +624,7 @@ def remove(filter_str: Optional[str]=None,
     try:
         to_keep = []
         to_delete = []
-        all_links = load_main_index(out_dir=out_dir)
+        all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
         for link in all_links:
             should_remove = (
                 (after is not None and float(link.timestamp) < after)
@@ -679,7 +668,7 @@ def update(resume: Optional[float]=None,
     # Step 1: Load list of links from the existing index
     # merge in and dedupe new links from import_path
     new_links: List[Link] = []
-    all_links = load_main_index(out_dir=out_dir)
+    all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
 
     # Step 2: Write updated index with deduped old and new links back to disk
     # write_main_index(links=list(all_links), out_dir=out_dir)
@@ -716,7 +705,7 @@ def update(resume: Optional[float]=None,
     archive_links(to_archive, overwrite=overwrite, out_dir=out_dir)
 
     # Step 4: Re-write links index with updated titles, icons, and resources
-    all_links = load_main_index(out_dir=out_dir)
+    all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
     write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
     return all_links
 
@@ -777,7 +766,7 @@ def list_links(filter_patterns: Optional[List[str]]=None,
 
     check_data_folder(out_dir=out_dir)
 
-    all_links = load_main_index(out_dir=out_dir)
+    all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
 
     for link in all_links:
         if after is not None and float(link.timestamp) < after:
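
Note (not part of the patch): a minimal usage sketch of the new behaviour, assuming the
package is importable as `archivebox` and that `Snapshot.as_link()` is the conversion
helper used in the hunks above.

    # Hypothetical sketch: load_main_index() now returns a lazy Django QuerySet of
    # Snapshot rows instead of an eagerly-built List[Link].
    from archivebox.index import load_main_index

    snapshots = load_main_index(out_dir='.')       # QuerySet[Snapshot], nothing fetched yet
    print(snapshots.count())                       # COUNT(*) in SQL, as used by status()
    recent = snapshots.order_by('-updated')[:10]   # lazy slice, as used by status()
    links = [s.as_link() for s in snapshots]       # eager conversion, as in add()/update()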