diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 99894e16..64829bd0 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -236,7 +236,7 @@ def timed_index_update(out_path: str):
 
 
 @enforce_types
-def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
+def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False, write_static: bool=False) -> None:
     """create index.html file for a given list of links"""
 
     log_indexing_process_started(len(links))
@@ -246,11 +246,12 @@ def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=
             write_sql_main_index(links, out_dir=out_dir)
             os.chmod(os.path.join(out_dir, SQL_INDEX_FILENAME), int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
 
-        with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
-            write_json_main_index(links, out_dir=out_dir)
+        if write_static:
+            with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
+                write_json_main_index(links, out_dir=out_dir)
 
-        with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
-            write_html_main_index(links, out_dir=out_dir, finished=finished)
+            with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
+                write_html_main_index(links, out_dir=out_dir, finished=finished)
     except (KeyboardInterrupt, SystemExit):
         stderr('[!] Warning: Still writing index to disk...', color='lightyellow')
         stderr('    Run archivebox init to fix any inconsisntencies from an ungraceful exit.')
@@ -268,26 +269,9 @@ def load_main_index(out_dir: str=OUTPUT_DIR, warn: bool=True) -> List[Link]:
 
     all_links: List[Link] = []
     try:
-        all_links = list(parse_json_main_index(out_dir))
-        links_from_sql = list(parse_sql_main_index(out_dir))
+        all_links = list(parse_sql_main_index(out_dir))
 
-        json_urls = set(l.url for l in all_links)
-        sql_urls = set(l.url for l in links_from_sql)
-        only_in_sql = sql_urls - json_urls
-        only_in_json = json_urls - sql_urls
-
-        if only_in_json:
-            stderr('{red}[!] Warning: SQL index does not match JSON index!{reset}'.format(**ANSI))
-            if only_in_json:
-                stderr('    > Only in JSON: {}...'.format(', '.join(list(only_in_json)[:5])))
-            if only_in_sql:
-                stderr('    > Only in SQL: {}...'.format(', '.join(list(only_in_sql)[:5])))
-
-            stderr('    To repair the index and re-import any orphaned links run:')
-            stderr('        archivebox init')
-        if only_in_sql:
-            # meh, this harmless, it'll get overwritten on next run anyway
-            pass
 
     except (KeyboardInterrupt, SystemExit):
         raise SystemExit(0)
diff --git a/archivebox/main.py b/archivebox/main.py
index 3958405c..314184ca 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -3,6 +3,7 @@ __package__ = 'archivebox'
 import os
 import sys
 import shutil
+from pathlib import Path
 from typing import Dict, List, Optional, Iterable, IO, Union
 
 from crontab import CronTab, CronSlices
@@ -252,7 +253,8 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
     """Initialize a new ArchiveBox collection in the current directory"""
     os.makedirs(out_dir, exist_ok=True)
     is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR)
-    existing_index = os.path.exists(os.path.join(out_dir, JSON_INDEX_FILENAME))
+
+    existing_index = (Path(out_dir) / SQL_INDEX_FILENAME).exists()
 
     if is_empty and not existing_index:
         print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
@@ -264,11 +266,11 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
         print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
     else:
         if force:
-            stderr('[!] This folder appears to already have files in it, but no index.json is present.', color='lightyellow')
+            stderr('[!] This folder appears to already have files in it, but no index.sqlite3 is present.', color='lightyellow')
             stderr('    Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).')
         else:
             stderr(
-                ("{red}[X] This folder appears to already have files in it, but no index.json is present.{reset}\n\n"
+                ("{red}[X] This folder appears to already have files in it, but no index.sqlite3 is present.{reset}\n\n"
                  "    You must run init in a completely empty directory, or an existing data folder.\n\n"
                  "    {lightred}Hint:{reset} To import an existing data folder make sure to cd into the folder first, \n"
                  "    then run and run 'archivebox init' to pick up where you left off.\n\n"
@@ -342,16 +344,6 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
         all_links.update(orphaned_json_links)
         print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
 
-    # Links in SQL index but not in main index
-    orphaned_sql_links = {
-        link.url: link
-        for link in parse_sql_main_index(out_dir)
-        if link.url not in all_links
-    }
-    if orphaned_sql_links:
-        all_links.update(orphaned_sql_links)
-        print('    {lightyellow}√ Added {} orphaned links from existing SQL index...{reset}'.format(len(orphaned_sql_links), **ANSI))
-
     # Links in data dir indexes but not in main index
     orphaned_data_dir_links = {
         link.url: link
@@ -376,7 +368,7 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
         print('        archivebox list --status=invalid')
 
 
-    write_main_index(list(all_links.values()), out_dir=out_dir)
+    write_main_index(list(all_links.values()), out_dir=out_dir, write_static=True)
 
     print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
     if existing_index: