Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-13 22:54:27 -04:00)
add archivebox info command to scan data dir
parent 50b947f41d
commit ab68819332
4 changed files with 105 additions and 3 deletions
archivebox/cli/archivebox_info.py (new file, +28 lines)
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox info'
+__description__ = 'Print out some info and statistics about the archive collection'
+
+import sys
+import argparse
+
+from ..legacy.main import info
+from ..legacy.util import reject_stdin
+
+
+def main(args=None):
+    args = sys.argv[1:] if args is None else args
+
+    parser = argparse.ArgumentParser(
+        prog=__command__,
+        description=__description__,
+        add_help=True,
+    )
+    parser.parse_args(args)
+    reject_stdin(__command__)
+
+    info()
+
+
+if __name__ == '__main__':
+    main()
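The entrypoint above only parses arguments, rejects piped stdin, and hands off to info(), so it can also be driven straight from Python. A minimal usage sketch, assuming the archivebox package is importable and the configured output directory already contains a collection (this snippet is illustrative and not part of the commit):

# Illustrative only: invoke the new subcommand's entrypoint without the CLI wrapper.
from archivebox.cli.archivebox_info import main

main([])  # no extra arguments; prints the index/data-dir statistics via info()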
@@ -5,7 +5,12 @@ import shutil
 from typing import List, Optional, Iterable
 
 from .schema import Link
-from .util import enforce_types, TimedProgress
+from .util import (
+    enforce_types,
+    TimedProgress,
+    get_dir_size,
+    human_readable_size,
+)
 from .index import (
     links_after_timestamp,
     load_main_index,
@@ -119,6 +124,47 @@ def init():
     print('    archivebox help')
 
 
+@enforce_types
+def info():
+    all_links = load_main_index(out_dir=OUTPUT_DIR)
+
+    print('{green}[*] Scanning archive collection main index with {} links:{reset}'.format(len(all_links), **ANSI))
+    print(f'    {OUTPUT_DIR}')
+
+    num_bytes, num_dirs, num_files = get_dir_size(OUTPUT_DIR, recursive=False)
+    size = human_readable_size(num_bytes)
+    print(f'    > Index Size: {size} across {num_files} files')
+    print()
+
+    print('{green}[*] Scanning archive collection data directory with {} entries:{reset}'.format(len(all_links), **ANSI))
+    print(f'    {ARCHIVE_DIR}')
+
+    num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
+    size = human_readable_size(num_bytes)
+    print(f'    > Total Size: {size} across {num_files} files in {num_dirs} directories')
+    print()
+
+    link_data_dirs = {link.link_dir for link in all_links}
+    valid_archive_dirs = set()
+    num_invalid = 0
+    for entry in os.scandir(ARCHIVE_DIR):
+        if entry.is_dir(follow_symlinks=True):
+            if os.path.exists(os.path.join(entry.path, 'index.json')):
+                valid_archive_dirs.add(entry.path)
+            else:
+                num_invalid += 1
+
+    print(f'    > {len(valid_archive_dirs)} valid archive data directories (valid directories matched to links in the index)')
+
+    num_unarchived = sum(1 for link in all_links if link.link_dir not in valid_archive_dirs)
+    print(f'    > {num_unarchived} missing data directories (directories missing for links in the index)')
+
+    print(f'    > {num_invalid} invalid data directories (directories present that don\'t contain an index file)')
+
+    num_orphaned = sum(1 for data_dir in valid_archive_dirs if data_dir not in link_data_dirs)
+    print(f'    > {num_orphaned} orphaned data directories (directories present for links that don\'t exist in the index)')
+
+
 @enforce_types
 def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None, only_new: bool=False) -> List[Link]:
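The bookkeeping in info() above reduces to comparing two sets: the directories the index expects (link.link_dir for each link) and the directories actually found under ARCHIVE_DIR. Any on-disk directory containing an index.json is counted as valid, one without it as invalid; index entries with no matching directory are missing, and valid directories with no matching index entry are orphaned. A self-contained sketch of the same counting with fabricated paths (nothing below comes from the commit):

# Toy re-statement of the counting in info(), using made-up directory names.
expected_dirs = {'archive/1111', 'archive/2222', 'archive/3333'}  # link.link_dir for each index entry
valid_dirs    = {'archive/1111', 'archive/9999'}                  # found on disk with an index.json
num_invalid   = 1                                                 # found on disk without an index.json

num_missing  = sum(1 for d in expected_dirs if d not in valid_dirs)  # 2: in the index, no data dir
num_orphaned = sum(1 for d in valid_dirs if d not in expected_dirs)  # 1: data dir not in the index

print(len(valid_dirs), num_missing, num_invalid, num_orphaned)  # -> 2 2 1 1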
@@ -27,7 +27,6 @@ MAIN_INDEX_HEADER = {
     'copyright_info': FOOTER_INFO,
     'meta': {
         'project': 'ArchiveBox',
-        'cmd': sys.argv,
         'version': VERSION,
         'git_sha': GIT_SHA,
         'website': 'https://ArchiveBox.io',
@@ -72,6 +71,7 @@ def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
         **MAIN_INDEX_HEADER,
         'num_links': len(links),
         'updated': datetime.now(),
+        'last_run_cmd': sys.argv,
         'links': links,
     }
     atomic_write(main_index_json, os.path.join(out_dir, JSON_INDEX_FILENAME))
@@ -7,7 +7,7 @@ import shutil
 
 from string import Template
 from json import JSONEncoder
-from typing import List, Optional, Any, Union, IO, Mapping
+from typing import List, Optional, Any, Union, IO, Mapping, Tuple
 from inspect import signature
 from functools import wraps
 from hashlib import sha256
@@ -561,6 +561,34 @@ def copy_and_overwrite(from_path: str, to_path: str):
     with open(from_path, 'rb') as src:
         atomic_write(src.read(), to_path)
 
 
+@enforce_types
+def get_dir_size(path: str, recursive: bool=True) -> Tuple[int, int, int]:
+    num_bytes, num_dirs, num_files = 0, 0, 0
+    for entry in os.scandir(path):
+        if entry.is_dir(follow_symlinks=False):
+            if not recursive:
+                continue
+            num_dirs += 1
+            bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
+            num_bytes += bytes_inside
+            num_dirs += dirs_inside
+            num_files += files_inside
+        else:
+            num_bytes += entry.stat(follow_symlinks=False).st_size
+            num_files += 1
+    return num_bytes, num_dirs, num_files
+
+
+@enforce_types
+def human_readable_size(num_bytes: Union[int, float]) -> str:
+    for count in ['Bytes','KB','MB','GB']:
+        if num_bytes > -1024.0 and num_bytes < 1024.0:
+            return '%3.1f%s' % (num_bytes, count)
+        num_bytes /= 1024.0
+    return '%3.1f%s' % (num_bytes, 'TB')
+
+
 @enforce_types
 def chrome_args(**options) -> List[str]:
     """helper to build up a chrome shell command with arguments"""
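To make the new size helpers concrete, here is the conversion loop from human_readable_size() copied out standalone with a few sample inputs (a throwaway sketch; only the loop mirrors the code added above, the sample values are made up):

# Standalone copy of the human_readable_size() conversion, with sample values.
def human_readable_size(num_bytes):
    for count in ['Bytes', 'KB', 'MB', 'GB']:
        if -1024.0 < num_bytes < 1024.0:
            return '%3.1f%s' % (num_bytes, count)
        num_bytes /= 1024.0
    return '%3.1f%s' % (num_bytes, 'TB')

print(human_readable_size(512))          # 512.0Bytes
print(human_readable_size(1536))         # 1.5KB
print(human_readable_size(3 * 1024**3))  # 3.0GB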