feat: Replace index.json with index.sql as the main index in init

This commit is contained in:
Cristian 2020-08-18 11:45:27 -05:00 committed by Cristian Vargas
parent a6b9b04297
commit 02f36b2096
2 changed files with 14 additions and 38 deletions

View file

@@ -236,7 +236,7 @@ def timed_index_update(out_path: str):
 @enforce_types
-def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
+def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False, write_static: bool=False) -> None:
     """create index.html file for a given list of links"""
     log_indexing_process_started(len(links))
@@ -246,6 +246,7 @@ def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False, write_static: bool=False) -> None:
         write_sql_main_index(links, out_dir=out_dir)
         os.chmod(os.path.join(out_dir, SQL_INDEX_FILENAME), int(OUTPUT_PERMISSIONS, base=8))  # set here because we don't write it with atomic writes
+    if write_static:
         with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
             write_json_main_index(links, out_dir=out_dir)
@@ -268,26 +269,9 @@ def load_main_index(out_dir: str=OUTPUT_DIR, warn: bool=True) -> List[Link]:
     all_links: List[Link] = []
     try:
-        all_links = list(parse_json_main_index(out_dir))
-        links_from_sql = list(parse_sql_main_index(out_dir))
-        json_urls = set(l.url for l in all_links)
-        sql_urls = set(l.url for l in links_from_sql)
-        only_in_sql = sql_urls - json_urls
-        only_in_json = json_urls - sql_urls
-        if only_in_json:
-            stderr('{red}[!] Warning: SQL index does not match JSON index!{reset}'.format(**ANSI))
-            if only_in_json:
-                stderr('    > Only in JSON: {}...'.format(', '.join(list(only_in_json)[:5])))
-            if only_in_sql:
-                stderr('    > Only in SQL: {}...'.format(', '.join(list(only_in_sql)[:5])))
-            stderr('    To repair the index and re-import any orphaned links run:')
-            stderr('        archivebox init')
-        if only_in_sql:
-            # meh, this harmless, it'll get overwritten on next run anyway
-            pass
+        all_links = list(parse_sql_main_index(out_dir))
+        list(parse_sql_main_index(out_dir))
     except (KeyboardInterrupt, SystemExit):
         raise SystemExit(0)

View file

@@ -3,6 +3,7 @@ __package__ = 'archivebox'
 import os
 import sys
 import shutil
+from pathlib import Path

 from typing import Dict, List, Optional, Iterable, IO, Union
 from crontab import CronTab, CronSlices
@@ -252,7 +253,8 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
     """Initialize a new ArchiveBox collection in the current directory"""
     os.makedirs(out_dir, exist_ok=True)
     is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR)
-    existing_index = os.path.exists(os.path.join(out_dir, JSON_INDEX_FILENAME))
+    existing_index = (Path(out_dir) / SQL_INDEX_FILENAME).exists()

     if is_empty and not existing_index:
         print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
@@ -264,11 +266,11 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
         print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
     else:
         if force:
-            stderr('[!] This folder appears to already have files in it, but no index.json is present.', color='lightyellow')
+            stderr('[!] This folder appears to already have files in it, but no index.sqlite3 is present.', color='lightyellow')
             stderr('    Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).')
         else:
             stderr(
-                ("{red}[X] This folder appears to already have files in it, but no index.json is present.{reset}\n\n"
+                ("{red}[X] This folder appears to already have files in it, but no index.sqlite3 present.{reset}\n\n"
                 "    You must run init in a completely empty directory, or an existing data folder.\n\n"
                 "    {lightred}Hint:{reset} To import an existing data folder make sure to cd into the folder first, \n"
                 "    then run and run 'archivebox init' to pick up where you left off.\n\n"
@@ -342,16 +344,6 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
             all_links.update(orphaned_json_links)
             print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))

-        # Links in SQL index but not in main index
-        orphaned_sql_links = {
-            link.url: link
-            for link in parse_sql_main_index(out_dir)
-            if link.url not in all_links
-        }
-        if orphaned_sql_links:
-            all_links.update(orphaned_sql_links)
-            print('    {lightyellow}√ Added {} orphaned links from existing SQL index...{reset}'.format(len(orphaned_sql_links), **ANSI))
-
         # Links in data dir indexes but not in main index
         orphaned_data_dir_links = {
             link.url: link
@@ -376,7 +368,7 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
             print('        archivebox list --status=invalid')

-    write_main_index(list(all_links.values()), out_dir=out_dir)
+    write_main_index(list(all_links.values()), out_dir=out_dir, write_static=True)

     print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
     if existing_index: