mirror of https://github.com/ArchiveBox/ArchiveBox.git, synced 2025-05-12 22:25:44 -04:00
allow cancelling init partway through and show hints to resume
This commit is contained in:
parent 75e1bfd0a9
commit ba2afa0672
2 changed files with 62 additions and 55 deletions
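
The substance of the commit is in the last hunk below: the slow archive-folder consistency checks in `init` get wrapped in a try/except so that Ctrl-C (or a SIGTERM delivered as SystemExit) aborts cleanly and prints hints on how to resume, instead of dying partway with no explanation. A minimal self-contained sketch of that pattern, with hypothetical stand-ins for ArchiveBox's internals (`check_archive_directories` and this simplified `stderr` helper are placeholders, not the project's real API):

import sys

def stderr(*args):
    # stand-in for ArchiveBox's stderr() helper (the real one also handles color)
    print(*args, file=sys.stderr)

def check_archive_directories():
    # placeholder for the slow orphan/invalid-folder checks that `init` performs
    pass

def init(quick: bool = False) -> None:
    if quick:
        print('    > Skipping full snapshot directory check (quick mode)')
        return
    try:
        check_archive_directories()
    except (KeyboardInterrupt, SystemExit):
        # interrupting the check loses no data; the next run picks up the rest
        stderr('[x] Stopped checking archive directories due to Ctrl-C/SIGTERM')
        stderr('    Your archive data is safe, but you should re-run `archivebox init` to finish the process later.')
        stderr('    Hint: run `archivebox init --quick` to skip the directory check entirely.')
        raise SystemExit(1)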
@@ -285,11 +285,11 @@ def log_indexing_process_finished():
 
 def log_indexing_started(out_path: str):
     if IS_TTY:
-        sys.stdout.write(f'    > {out_path}')
+        sys.stdout.write(f'    > ./{Path(out_path).relative_to(OUTPUT_DIR)}')
 
 
 def log_indexing_finished(out_path: str):
-    print(f'\r    √ {out_path}')
+    print(f'\r    √ ./{Path(out_path).relative_to(OUTPUT_DIR)}')
 
 
 ### Archiving Stage
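
This first hunk, in the indexing log helpers, only changes how paths are displayed: absolute paths become `./`-prefixed paths relative to the collection root. For illustration, a standalone sketch of pathlib's relative_to (the directory values here are made up):

from pathlib import Path

OUTPUT_DIR = Path('/data/archivebox')        # hypothetical collection root
out_path = '/data/archivebox/index.json'     # hypothetical file inside it

# prints "    > ./index.json" instead of the full absolute path
print(f'    > ./{Path(out_path).relative_to(OUTPUT_DIR)}')

# caveat: relative_to() raises ValueError for paths outside OUTPUT_DIR,
# e.g. Path('/tmp/x').relative_to(OUTPUT_DIR), so this display style
# assumes everything logged lives under the collection root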
@@ -298,13 +298,12 @@ def init(force: bool=False, quick: bool=False, out_dir: Path=OUTPUT_DIR) -> None
     existing_index = (Path(out_dir) / SQL_INDEX_FILENAME).exists()
 
     if is_empty and not existing_index:
-        print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
-        print(f'    {out_dir}')
-        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+        print('{green}[+] Initializing a new ArchiveBox v{} collection...{reset}'.format(VERSION, **ANSI))
+        print('{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
     elif existing_index:
-        print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI))
-        print(f'    {out_dir}')
-        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+        # TODO: properly detect and print the existing version in current index as well
+        print('{green}[^] Verifying and updating existing ArchiveBox collection to v{}...{reset}'.format(VERSION, **ANSI))
+        print('{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
     else:
         if force:
             stderr('[!] This folder appears to already have files in it, but no index.sqlite3 is present.', color='lightyellow')
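
The banner strings above rely on `str.format` with a dict of ANSI escape codes unpacked as keyword arguments; the auto-numbered `{}` field (filled by VERSION) mixes freely with the named `{green}`/`{reset}` fields. A self-contained approximation (this two-entry ANSI palette is a stand-in, not ArchiveBox's real one):

VERSION = '0.5.0'                                  # hypothetical version string
ANSI = {'green': '\033[32m', 'reset': '\033[0m'}   # minimal stand-in palette

# '{}' is filled positionally by VERSION; '{green}' and '{reset}' come from **ANSI
print('{green}[+] Initializing a new ArchiveBox v{} collection...{reset}'.format(VERSION, **ANSI))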
@@ -325,26 +324,25 @@ def init(force: bool=False, quick: bool=False, out_dir: Path=OUTPUT_DIR) -> None
     else:
         print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))
 
-        print(f'    + {ARCHIVE_DIR}, {SOURCES_DIR}, {LOGS_DIR}...')
+        print(f'    + ./{ARCHIVE_DIR.relative_to(OUTPUT_DIR)}, ./{SOURCES_DIR.relative_to(OUTPUT_DIR)}, ./{LOGS_DIR.relative_to(OUTPUT_DIR)}...')
         Path(SOURCES_DIR).mkdir(exist_ok=True)
         Path(ARCHIVE_DIR).mkdir(exist_ok=True)
         Path(LOGS_DIR).mkdir(exist_ok=True)
-        print(f'    + {CONFIG_FILE}...')
+        print(f'    + ./{CONFIG_FILE.relative_to(OUTPUT_DIR)}...')
         write_config_file({}, out_dir=out_dir)
 
     if (Path(out_dir) / SQL_INDEX_FILENAME).exists():
-        print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI))
+        print('\n{green}[*] Verifying main SQL index and running any migrations needed...{reset}'.format(**ANSI))
     else:
-        print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI))
+        print('\n{green}[+] Building main SQL index and running initial migrations...{reset}'.format(**ANSI))
 
     DATABASE_FILE = Path(out_dir) / SQL_INDEX_FILENAME
-    print(f'    √ {DATABASE_FILE}')
-    print()
     for migration_line in apply_migrations(out_dir):
         print(f'    {migration_line}')
 
-
     assert DATABASE_FILE.exists()
+    print()
+    print(f'    √ ./{DATABASE_FILE.relative_to(OUTPUT_DIR)}')
 
     # from django.contrib.auth.models import User
     # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
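
Worth noting: every step in this hunk is safe to repeat, which is what makes "cancel and re-run init" viable at all. `mkdir(exist_ok=True)` is a no-op on existing directories, and Django-style migrations record what has already been applied and skip it on the next run. A quick illustration of the mkdir half (the path is hypothetical):

from pathlib import Path
import tempfile

out_dir = Path(tempfile.mkdtemp()) / 'collection'   # hypothetical collection root

for name in ('archive', 'sources', 'logs'):
    (out_dir / name).mkdir(parents=True, exist_ok=True)

# running the same loop again raises nothing and changes nothing
for name in ('archive', 'sources', 'logs'):
    (out_dir / name).mkdir(parents=True, exist_ok=True)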
@@ -352,7 +350,7 @@ def init(force: bool=False, quick: bool=False, out_dir: Path=OUTPUT_DIR) -> None
     #     call_command("createsuperuser", interactive=True)
 
     print()
-    print('{green}[*] Checking links from indexes and archive folders...{reset}'.format(**ANSI))
+    print('{green}[*] Checking links from indexes and archive folders (safe to Ctrl+C)...{reset}'.format(**ANSI))
 
     all_links = Snapshot.objects.none()
     pending_links: Dict[str, Link] = {}
@@ -364,56 +362,65 @@ def init(force: bool=False, quick: bool=False, out_dir: Path=OUTPUT_DIR) -> None
         if quick:
             print('    > Skipping full snapshot directory check (quick mode)')
         else:
-            # Links in data folders that dont match their timestamp
-            fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
-            if fixed:
-                print('    {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
-            if cant_fix:
-                print('    {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))
-
-            # Links in JSON index but not in main index
-            orphaned_json_links = {
-                link.url: link
-                for link in parse_json_main_index(out_dir)
-                if not all_links.filter(url=link.url).exists()
-            }
-            if orphaned_json_links:
-                pending_links.update(orphaned_json_links)
-                print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
-
-            # Links in data dir indexes but not in main index
-            orphaned_data_dir_links = {
-                link.url: link
-                for link in parse_json_links_details(out_dir)
-                if not all_links.filter(url=link.url).exists()
-            }
-            if orphaned_data_dir_links:
-                pending_links.update(orphaned_data_dir_links)
-                print('    {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))
-
-            # Links in invalid/duplicate data dirs
-            invalid_folders = {
-                folder: link
-                for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items()
-            }
-            if invalid_folders:
-                print('    {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
-                print('        X ' + '\n        X '.join(f'{folder} {link}' for folder, link in invalid_folders.items()))
-                print()
-                print('        {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
-                print('            archivebox status')
-                print('            archivebox list --status=invalid')
+            try:
+                # Links in data folders that dont match their timestamp
+                fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
+                if fixed:
+                    print('    {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
+                if cant_fix:
+                    print('    {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))
+
+                # Links in JSON index but not in main index
+                orphaned_json_links = {
+                    link.url: link
+                    for link in parse_json_main_index(out_dir)
+                    if not all_links.filter(url=link.url).exists()
+                }
+                if orphaned_json_links:
+                    pending_links.update(orphaned_json_links)
+                    print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
+
+                # Links in data dir indexes but not in main index
+                orphaned_data_dir_links = {
+                    link.url: link
+                    for link in parse_json_links_details(out_dir)
+                    if not all_links.filter(url=link.url).exists()
+                }
+                if orphaned_data_dir_links:
+                    pending_links.update(orphaned_data_dir_links)
+                    print('    {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))
+
+                # Links in invalid/duplicate data dirs
+                invalid_folders = {
+                    folder: link
+                    for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items()
+                }
+                if invalid_folders:
+                    print('    {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
+                    print('        X ' + '\n        X '.join(f'./{Path(folder).relative_to(OUTPUT_DIR)} {link}' for folder, link in invalid_folders.items()))
+                    print()
+                    print('        {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
+                    print('            archivebox status')
+                    print('            archivebox list --status=invalid')
+
+            except (KeyboardInterrupt, SystemExit):
+                stderr()
+                stderr('[x] Stopped checking archive directories due to Ctrl-C/SIGTERM', color='red')
+                stderr('    Your archive data is safe, but you should re-run `archivebox init` to finish the process later.')
+                stderr()
+                stderr('    {lightred}Hint:{reset} In the future you can run a quick init without checking dirs like so:'.format(**ANSI))
+                stderr('        archivebox init --quick')
+                raise SystemExit(1)
 
-
     write_main_index(list(pending_links.values()), out_dir=out_dir)
 
-    print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+    print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
     if existing_index:
         print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
     else:
         print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links) + len(pending_links), **ANSI))
 
-    if Snapshot.objects.count() < 20:   # hide the hints for experienced users
+    if Snapshot.objects.count() < 25:   # hide the hints for experienced users
         print()
         print('    {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
         print('        archivebox server  # then visit http://127.0.0.1:8000')
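
Put together, an interrupted-then-resumed session looks roughly like this (output paraphrased from the strings added in the hunks above):

$ archivebox init
[*] Checking links from indexes and archive folders (safe to Ctrl+C)...
^C
[x] Stopped checking archive directories due to Ctrl-C/SIGTERM
    Your archive data is safe, but you should re-run `archivebox init` to finish the process later.

    Hint: In the future you can run a quick init without checking dirs like so:
        archivebox init --quick

$ archivebox init --quick
    > Skipping full snapshot directory check (quick mode)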