feat: Update init to take advantage of querysets to reduce memory consumption

commit dae606de6e (parent 6b4b7127b4)
1 changed file with 12 additions and 14 deletions
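The memory saving comes from Django QuerySets being lazy: a queryset stores a SQL query rather than result rows, so counting and membership checks can be pushed down into the database instead of materializing every Link in Python. A minimal sketch of the two approaches, assuming a hypothetical Snapshot model with an as_link() helper (as the diff suggests), not ArchiveBox's exact schema:

from core.models import Snapshot  # hypothetical import path, for illustration only

# Old approach: build a dict of every link up front
all_links = {s.as_link().url: s.as_link() for s in Snapshot.objects.all()}
total = len(all_links)                    # the whole dict must be in memory

# New approach: keep the lazy queryset; no rows are fetched yet
all_links = Snapshot.objects.all()
total = all_links.count()                 # SELECT COUNT(*) runs in the database
known = all_links.filter(url='https://example.com').exists()  # SELECT ... LIMIT 1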
@@ -318,12 +318,11 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
     print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))
 
     all_links: Dict[str, Link] = {}
+    pending_links: Dict[str, Link] = {}
 
     if existing_index:
-        all_links = {
-            link.url: link
-            for link in [x.as_link for x in load_main_index(out_dir=out_dir, warn=False)]
-        }
-        print('    √ Loaded {} links from existing main index.'.format(len(all_links)))
+        all_links = load_main_index(out_dir=out_dir, warn=False)
+        print('    √ Loaded {} links from existing main index.'.format(all_links.count()))
 
     # Links in data folders that dont match their timestamp
     fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
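With the index kept as a queryset, len() gives way to count(), which delegates the counting to the database rather than pulling every row into Python first. Roughly, assuming the default table name Django would generate for a Snapshot model in an app named core:

Snapshot.objects.count()        # runs: SELECT COUNT(*) FROM "core_snapshot"
len(Snapshot.objects.all())     # fetches every row, builds objects, then counts

pending_links starts out empty so that only links not already present in the database-backed index ever accumulate in memory.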
@@ -336,27 +335,26 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
     orphaned_json_links = {
         link.url: link
         for link in parse_json_main_index(out_dir)
-        if link.url not in all_links
+        if not all_links.filter(url=link.url).exists()
     }
     if orphaned_json_links:
-        all_links.update(orphaned_json_links)
+        pending_links.update(orphaned_json_links)
         print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
 
     # Links in data dir indexes but not in main index
     orphaned_data_dir_links = {
         link.url: link
         for link in parse_json_links_details(out_dir)
-        if link.url not in all_links
+        if not all_links.filter(url=link.url).exists()
     }
     if orphaned_data_dir_links:
-        all_links.update(orphaned_data_dir_links)
+        pending_links.update(orphaned_data_dir_links)
         print('    {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))
 
     # Links in invalid/duplicate data dirs
-    invalid_folders = {
-        folder: link
-        for folder, link in get_invalid_folders(all_links.values(), out_dir=out_dir).items()
-    }
+    invalid_folders: Dict[str, Link] = {}
+    for link in all_links:
+        invalid_folders.update(get_invalid_folders([link.as_link()], out_dir=out_dir).items())
     if invalid_folders:
         print('    {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
         print('        X ' + '\n        X '.join(f'{folder} {link}' for folder, link in invalid_folders.items()))
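On the queryset, each dict membership test becomes a single indexed query: slower per check than a dict probe, but nothing beyond the current candidate has to be resident in memory. Side by side, with orphans standing in for either orphaned-links dict:

# Before: O(1) probe against a dict holding every known link
if link.url not in all_links:                      # all_links: Dict[str, Link]
    orphans[link.url] = link

# After: SELECT ... WHERE url = ? LIMIT 1 per candidate
if not all_links.filter(url=link.url).exists():    # all_links: QuerySet
    orphans[link.url] = link

The invalid-folders check follows the same pattern, swapping one bulk get_invalid_folders() call over the materialized all_links.values() for a per-link call while iterating the queryset. (One caveat worth knowing: plain iteration over a Django queryset still fetches and caches all of its rows.)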
@@ -366,7 +364,7 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
         print('        archivebox list --status=invalid')
 
 
-    write_main_index(list(all_links.values()), out_dir=out_dir, finished=True)
+    write_main_index(list(pending_links.values()), out_dir=out_dir, finished=True)
 
     print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
     if existing_index:
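Net effect on init: the final index write now receives only the links that were missing from the database, instead of a re-materialized copy of everything. A condensed sketch of the resulting flow, not the verbatim function:

all_links = load_main_index(out_dir=out_dir, warn=False)  # lazy queryset of known links
pending_links: Dict[str, Link] = {}                       # only what the DB doesn't have yet

for link in parse_json_main_index(out_dir):               # orphans in the old JSON index
    if not all_links.filter(url=link.url).exists():
        pending_links[link.url] = link

for link in parse_json_links_details(out_dir):            # orphans in archive data dirs
    if not all_links.filter(url=link.url).exists():
        pending_links[link.url] = link

write_main_index(list(pending_links.values()), out_dir=out_dir, finished=True)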