Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-16 16:14:28 -04:00)
refactor: Get archivebox init to run

parent 005c8a60c0
commit e0e65bf4b1

3 changed files with 18 additions and 34 deletions
@@ -249,7 +249,6 @@ def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
     """
     Returns all of the snapshots currently in index
     """
-    setup_django(out_dir, check_db=True)
     from core.models import Snapshot
     try:
         return Snapshot.objects.all()
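Note: with setup_django(out_dir, check_db=True) removed from load_main_index(), Django presumably has to be configured before this function is called. A minimal hedged sketch, assuming setup_django and OUTPUT_DIR still live in archivebox.config and that this module is importable as archivebox.index:

# Hedged sketch, not part of this commit: calling load_main_index() after the
# caller has configured Django itself.
from archivebox.config import OUTPUT_DIR, setup_django   # assumed location
from archivebox.index import load_main_index             # assumed module path

setup_django(OUTPUT_DIR, check_db=True)   # now the caller's responsibility (assumption)
snapshots = load_main_index(out_dir=OUTPUT_DIR)
print('{} snapshots in the main index'.format(snapshots.count()))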
@@ -92,20 +92,7 @@ def write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) ->
 
 
 @enforce_types
-def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Model]:
-    """load the json link index from a given directory"""
-    existing_index = Path(out_dir) / JSON_INDEX_FILENAME
-    if existing_index.exists():
-        with open(existing_index, 'r', encoding='utf-8') as f:
-            try:
-                link_json = pyjson.load(f)
-                return Link.from_json(link_json, guess)
-            except pyjson.JSONDecodeError:
-                pass
-    return None
-
-@enforce_types
-def load_snapshot_details(snapshot: Model, out_dir: Path):
+def load_snapshot_details(out_Dir: Path) -> Optional[Model]:
     """
     Loads the detail from the local json index
     """
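For reference, a hedged sketch of how the renamed load_snapshot_details() might be called directly for a single snapshot folder; the module path and the directory name are assumptions, not from this commit:

# Hedged sketch: load the per-snapshot index.json of one archive directory.
from pathlib import Path
from archivebox.index.json import load_snapshot_details   # assumed module path

details = load_snapshot_details(Path('archive/1611229275'))   # hypothetical snapshot dir
if details is not None:
    print(details)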
@@ -119,20 +106,19 @@ def load_snapshot_details(snapshot: Model, out_dir: Path):
     return None
 
 
 
 @enforce_types
-def parse_json_snapshot_details(out_dir: Union[Path, str]) -> Iterator[Link]:
+def parse_json_snapshot_details(out_dir: Union[Path, str]) -> Iterator[dict]:
     """read through all the archive data folders and return the parsed links"""
 
     for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME):
         if entry.is_dir(follow_symlinks=True):
             if (Path(entry.path) / 'index.json').exists():
                 try:
-                    link = parse_json_snapshot_details(entry.path)
+                    snapshot_details = load_snapshot_details(entry.path)
                 except KeyError:
-                    link = None
-                if link:
-                    yield link
+                    snapshot_details = None
+                if snapshot_details:
+                    yield snapshot_details
 
 
-
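A hedged usage sketch of the updated generator: per the new annotation it yields one dict per archive folder that contains an index.json. The import paths and the 'url' key are assumptions, not from this commit:

# Hedged sketch, not part of the commit: iterate every per-snapshot index.json
# under the data dir and print the URL recorded in each one.
from archivebox.config import OUTPUT_DIR                        # assumed location
from archivebox.index.json import parse_json_snapshot_details   # assumed module path

for snapshot_details in parse_json_snapshot_details(OUTPUT_DIR):
    print(snapshot_details.get('url', '<missing url>'))         # assumes a 'url' key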
@@ -29,7 +29,6 @@ from .util import enforce_types # type: ignore
 from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
 from .index import (
     load_main_index,
-    get_empty_snapshot_queryset,
     parse_snapshots_from_source,
     filter_new_urls,
     write_main_index,
@@ -340,8 +339,8 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     pending_links: Dict[str, Link] = {}
 
     if existing_index:
-        all_links = load_main_index(out_dir=out_dir, warn=False)
-        print(' √ Loaded {} links from existing main index.'.format(all_links.count()))
+        all_snapshots = load_main_index(out_dir=out_dir, warn=False)
+        print(' √ Loaded {} snapshots from existing main index.'.format(all_snapshots.count()))
 
     # Links in data folders that dont match their timestamp
     fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
@@ -361,22 +360,22 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
         print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
 
     # Links in data dir indexes but not in main index
-    orphaned_data_dir_links = {
-        link.url: link
-        for link in parse_json_links_details(out_dir)
-        if not all_links.filter(url=link.url).exists()
+    orphaned_data_dir_snapshots = {
+        snapshot.url: snapshot
+        for snapshot in parse_json_snapshot_details(out_dir)
+        if not all_snapshots.filter(url=link.url).exists()
     }
-    if orphaned_data_dir_links:
-        pending_links.update(orphaned_data_dir_links)
-        print(' {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))
+    if orphaned_data_dir_snapshots:
+        pending_snapshots.update(orphaned_data_dir_links)
+        print(' {lightyellow}√ Added {} orphaned snapshots from existing archive directories.{reset}'.format(len(orphaned_data_dir_snapshots), **ANSI))
 
     # Links in invalid/duplicate data dirs
     invalid_folders = {
-        folder: link
-        for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items()
+        folder: snapshot
+        for folder, snapshot in get_invalid_folders(all_snapshots, out_dir=out_dir).items()
     }
     if invalid_folders:
-        print(' {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
+        print(' {lightyellow}! Skipped adding {} invalid snapshot data directories.{reset}'.format(len(invalid_folders), **ANSI))
         print(' X ' + '\n X '.join(f'{folder} {link}' for folder, link in invalid_folders.items()))
         print()
         print(' {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
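Per the commit message, the end goal is that the init command runs against an existing data directory. A hedged sketch of invoking the refactored init() directly from Python, which is what the archivebox init CLI command maps to; the import locations are assumptions:

# Hedged sketch, not from the commit: run init() programmatically.
from archivebox.config import OUTPUT_DIR   # assumed location
from archivebox.main import init           # assumed module path

init(force=False, out_dir=OUTPUT_DIR)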