Mirror of https://github.com/ArchiveBox/ArchiveBox.git, synced 2025-05-16 08:04:26 -04:00
commit 973f8b6abc (parent d92083b928)
refactor: status command is functional

5 changed files with 33 additions and 26 deletions
@@ -138,7 +138,7 @@ class Snapshot(models.Model):
     @cached_property
     def snapshot_dir(self):
         from ..config import CONFIG
-        return str(Path(CONFIG['ARCHIVE_DIR']) / self.timestamp)
+        return Path(CONFIG['ARCHIVE_DIR']) / self.timestamp

     @cached_property
     def archive_path(self):
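Worth noting: with snapshot_dir now returning a Path rather than a str, callers can use pathlib operations on the value directly. A minimal illustrative sketch, not from the diff; the directory value below is made up and stands in for snapshot.snapshot_dir:

    from pathlib import Path

    snapshot_dir = Path('/data/archive') / '1611234567.0'   # stand-in for snapshot.snapshot_dir
    index_json = snapshot_dir / 'index.json'                 # join with the / operator, no Path() re-wrapping needed
    print(index_json.exists())                               # pathlib methods work on the returned value
    print(snapshot_dir.name)                                 # e.g. the snapshot's timestamp folder name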
@@ -173,6 +173,12 @@ class Snapshot(models.Model):
         from ..util import is_static_file
         return is_static_file(self.url)

+    @cached_property
+    def details(self) -> Dict:
+        # TODO: Define what details are, and return them accordingly
+        return {"history": {}}
+
+

     def canonical_outputs(self) -> Dict[str, Optional[str]]:
         """predict the expected output paths that should be present after archiving"""
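As a reminder of the decorator used for the new details property, a cached_property computes its value once per instance and then serves the cached result. The Snapshot model may use Django's cached_property rather than functools'; this standalone sketch only illustrates the caching behavior:

    from functools import cached_property

    class Example:
        @cached_property
        def details(self):
            print('computing details...')
            return {"history": {}}

    e = Example()
    print(e.details)   # first access runs the method, prints 'computing details...', caches the dict
    print(e.details)   # second access returns the cached dict without recomputing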
@@ -88,7 +88,7 @@ def archive_snapshot(snapshot: Model, overwrite: bool=False, methods: Optional[I
             details = {"history": {}}
             write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False)
         else:
-            details = load_snapshot_details(snapshot)
+            details = snapshot.details

         #log_link_archiving_started(link, out_dir, is_new)
         stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
@@ -42,7 +42,7 @@ from .html import (
     write_html_snapshot_details,
 )
 from .json import (
-    load_json_snapshot_details,
+    load_json_snapshot,
     parse_json_snapshot_details,
     write_json_snapshot_details,
 )
@@ -441,7 +441,7 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
             path = path.snapshot_dir

         try:
-            snapshot = parse_json_snapshot_details(path)
+            snapshot = load_json_snapshot_details(path)
         except Exception:
             pass

@@ -530,7 +530,7 @@ def is_valid(snapshot: Model) -> bool:
     if dir_exists and index_exists:
         try:
             # TODO: review if the `guess` was necessary here
-            parsed_snapshot = parse_json_snapshot_details(snapshot.snapshot_dir)
+            parsed_snapshot = load_json_snapshot(snapshot.snapshot_dir)
             return snapshot.url == parsed_snapshot.url
         except Exception:
             pass
@@ -91,17 +91,18 @@ def write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) ->


 @enforce_types
-def load_json_snapshot_details(out_dir: Path) -> Optional[Model]:
+def load_json_snapshot(out_dir: Path) -> Optional[Model]:
     """
     Loads the detail from the local json index
     """
     from core.models import Snapshot

     existing_index = Path(out_dir) / JSON_INDEX_FILENAME
     if existing_index.exists():
         with open(existing_index, 'r', encoding='utf-8') as f:
             try:
                 output = pyjson.load(f)
                 if "history" not in output.keys():
                     output["history"] = {}
                 output = Snapshot.from_json(output)
                 return output
             except pyjson.JSONDecodeError:
                 pass
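A hedged usage sketch of the renamed loader: given a snapshot output directory containing an index.json, it returns a Snapshot model instance, or None when the file is missing or unparsable. The directory path is illustrative and the import path depends on the refactored package layout:

    from pathlib import Path
    # from <index json module> import load_json_snapshot   # import path is an assumption

    out_dir = Path('/data/archive/1611234567.0')   # hypothetical snapshot output folder
    snapshot = load_json_snapshot(out_dir)
    if snapshot is not None:
        print(snapshot.url)                        # fields come from the parsed index.json
    else:
        print(f'no parsable index.json in {out_dir}')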
@@ -110,13 +111,13 @@ def load_json_snapshot_details(out_dir: Path) -> Optional[Model]:

 @enforce_types
 def parse_json_snapshot_details(out_dir: Union[Path, str]) -> Iterator[dict]:
-    """read through all the archive data folders and return the parsed links"""
+    """read through all the archive data folders and return the parsed snapshots"""

     for entry in os.scandir(Path(out_dir)):
         if entry.is_dir(follow_symlinks=True):
             if (Path(entry.path) / 'index.json').exists():
                 try:
-                    snapshot_details = load_snapshot_details(entry.path)
+                    snapshot_details = load_json_snapshot_details(entry.path)
                 except KeyError:
                     snapshot_details = None
                 if snapshot_details:
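And a sketch of how the generator might be consumed, mirroring the counting that status() does further down; the data directory here is a hypothetical value:

    # Count snapshot folders that have a parsable index.json, as status() does below.
    out_dir = '/data'                              # hypothetical ArchiveBox data directory
    num_snapshot_details = sum(1 for _ in parse_json_snapshot_details(out_dir=out_dir))
    print(f'{num_snapshot_details} snapshot detail indexes found')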
@@ -427,11 +427,11 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
     print(f'    Index size: {size} across {num_files} files')
     print()

-    links = load_main_index(out_dir=out_dir)
-    num_sql_links = links.count()
-    num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
-    print(f'    > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
-    print(f'    > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
+    snapshots = load_main_index(out_dir=out_dir)
+    num_sql_snapshots = snapshots.count()
+    num_snapshot_details = sum(1 for snapshot in parse_json_snapshot_details(out_dir=out_dir))
+    print(f'    > SQL Main Index: {num_sql_snapshots} snapshots'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
+    print(f'    > JSON Link Details: {num_snapshot_details} snapshots'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
     print()
     print('{green}[*] Scanning archive data directories...{reset}'.format(**ANSI))
     print(ANSI['lightyellow'], f'   {ARCHIVE_DIR}/*', ANSI['reset'])
@@ -439,23 +439,23 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
     size = printable_filesize(num_bytes)
     print(f'    Size: {size} across {num_files} files in {num_dirs} directories')
     print(ANSI['black'])
-    num_indexed = len(get_indexed_folders(links, out_dir=out_dir))
-    num_archived = len(get_archived_folders(links, out_dir=out_dir))
-    num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir))
+    num_indexed = len(get_indexed_folders(snapshots, out_dir=out_dir))
+    num_archived = len(get_archived_folders(snapshots, out_dir=out_dir))
+    num_unarchived = len(get_unarchived_folders(snapshots, out_dir=out_dir))
     print(f'    > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
     print(f'    > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
     print(f'    > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')

-    num_present = len(get_present_folders(links, out_dir=out_dir))
-    num_valid = len(get_valid_folders(links, out_dir=out_dir))
+    num_present = len(get_present_folders(snapshots, out_dir=out_dir))
+    num_valid = len(get_valid_folders(snapshots, out_dir=out_dir))
     print()
     print(f'    > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
     print(f'    > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')

-    duplicate = get_duplicate_folders(links, out_dir=out_dir)
-    orphaned = get_orphaned_folders(links, out_dir=out_dir)
-    corrupted = get_corrupted_folders(links, out_dir=out_dir)
-    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
+    duplicate = get_duplicate_folders(snapshots, out_dir=out_dir)
+    orphaned = get_orphaned_folders(snapshots, out_dir=out_dir)
+    corrupted = get_corrupted_folders(snapshots, out_dir=out_dir)
+    unrecognized = get_unrecognized_folders(snapshots, out_dir=out_dir)
     num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
     print(f'    > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
     print(f'    > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
@@ -466,7 +466,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
     print(ANSI['reset'])

     if num_indexed:
-        print('    {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI))
+        print('    {lightred}Hint:{reset} You can list snapshot data directories by status like so:'.format(**ANSI))
         print('        archivebox list --status=<status>  (e.g. indexed, corrupted, archived, etc.)')

     if orphaned:
@@ -495,7 +495,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
         print('        archivebox manage createsuperuser')

     print()
-    for snapshot in links.order_by('-updated')[:10]:
+    for snapshot in snapshots.order_by('-updated')[:10]:
         if not snapshot.updated:
             continue
         print(