Mirror of https://github.com/ArchiveBox/ArchiveBox.git, synced 2025-05-17 00:24:26 -04:00
refactor: remove command functional
This commit is contained in:
parent 9aa934a410
commit 9fdcb9857e
3 changed files with 31 additions and 33 deletions
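
The first group of hunks below swaps the per-folder loader in the index package: parse_json_snapshot_details() is dropped from the imports and each call site uses load_json_snapshot() instead, usually wrapping its argument in Path(). A minimal sketch of that call-site pattern, assuming the import path shown in the comment (the wrapper name try_load_snapshot is hypothetical):

# --- illustrative sketch, not part of the diff ---
from pathlib import Path

# Assumed import path: the diff only shows `from .json import load_json_snapshot`
# without naming the package, so adjust this to the real module layout.
from archivebox.index.json import load_json_snapshot

def try_load_snapshot(entry_path: str):
    """Hypothetical helper mirroring the call sites touched by this commit."""
    # old call (removed):  parse_json_snapshot_details(entry_path)
    # new call (added):    load_json_snapshot(Path(entry_path))
    try:
        return load_json_snapshot(Path(entry_path))
    except Exception:
        # the folder scanners below treat unreadable index.json files as "no snapshot"
        return None
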
@@ -43,7 +43,6 @@ from .html import (
 )
 from .json import (
     load_json_snapshot,
-    parse_json_snapshot_details,
     write_json_snapshot_details,
 )
 from .sql import (
@@ -321,7 +320,7 @@ def load_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> Model
     """
     out_dir = out_dir or Path(snapshot.snapshot_dir)

-    existing_snapshot = load_json_snapshot_details(out_dir)
+    existing_snapshot = load_json_snapshot_details(Path(out_dir))
     if existing_snapshot:
         return merge_snapshots(existing_snapshot, snapshot)

@@ -402,7 +401,7 @@ def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
         if entry.is_dir():
             snapshot = None
             try:
-                snapshot = parse_json_snapshot_details(entry.path)
+                snapshot = load_json_snapshot(Path(entry.path))
             except Exception:
                 pass

@@ -441,7 +440,7 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
             path = path.snapshot_dir

         try:
-            snapshot = load_json_snapshot_details(path)
+            snapshot = load_json_snapshot(Path(path))
         except Exception:
             pass

@@ -465,7 +464,7 @@ def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
         if entry.is_dir():
             snapshot = None
             try:
-                snapshot = parse_json_snapshot_details(str(entry))
+                snapshot = load_json_snapshot(str(entry))
             except Exception:
                 pass

@@ -492,7 +491,7 @@ def get_unrecognized_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, O
         index_exists = (entry / "index.json").exists()
         snapshot = None
         try:
-            snapshot = parse_json_snapshot_details(str(entry))
+            snapshot = load_json_snapshot(entry)
         except KeyError:
             # Try to fix index
             if index_exists:
@@ -562,13 +561,13 @@ def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], L
         if entry.is_dir(follow_symlinks=True):
             if (Path(entry.path) / 'index.json').exists():
                 try:
-                    snapshot = parse_json_snapshot_details(entry.path)
+                    snapshot = load_json_snapshot(Path(entry.path))
                 except KeyError:
                     snapshot = None
                 if not snapshot:
                     continue

-                if not entry.path.endswith(f'/{link.timestamp}'):
+                if not entry.path.endswith(f'/{snapshot.timestamp}'):
                     dest = out_dir / ARCHIVE_DIR_NAME / snapshot.timestamp
                     if dest.exists():
                         cant_fix.append(entry.path)
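
The last hunk in this file also fixes a leftover from the Link → Snapshot rename: fix_invalid_folder_locations() now compares the folder name against snapshot.timestamp instead of the stale link.timestamp reference. A condensed sketch of that check, assuming ARCHIVE_DIR_NAME resolves to the archive/ subfolder (the helper name folder_is_misplaced is hypothetical):

# --- illustrative sketch, not part of the diff ---
from pathlib import Path

ARCHIVE_DIR_NAME = 'archive'   # assumed value of the constant used in the hunk above

def folder_is_misplaced(entry_path: str, snapshot, out_dir: Path) -> bool:
    """Hypothetical condensation of the renamed check in fix_invalid_folder_locations()."""
    # after this commit the folder is judged by the parsed snapshot's own timestamp
    if not entry_path.endswith(f'/{snapshot.timestamp}'):
        dest = out_dir / ARCHIVE_DIR_NAME / snapshot.timestamp
        # if the canonical location already exists, the real code records it in cant_fix
        return not dest.exists()
    return False
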
@@ -395,49 +395,49 @@ def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
     ))
     print(' {}'.format(' '.join(filter_patterns or ())))

-def log_list_finished(links):
-    from .index.csv import links_to_csv
+def log_list_finished(snapshots):
+    from .index.csv import snapshots_to_csv
     print()
     print('---------------------------------------------------------------------------------------------------')
-    print(links_to_csv(links, cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
+    print(snapshots_to_csv(snapshots, cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
     print('---------------------------------------------------------------------------------------------------')
     print()


-def log_removal_started(links: List["Link"], yes: bool, delete: bool):
-    print('{lightyellow}[i] Found {} matching URLs to remove.{reset}'.format(len(links), **ANSI))
+def log_removal_started(snapshots: List["Snapshot"], yes: bool, delete: bool):
+    print('{lightyellow}[i] Found {} matching URLs to remove.{reset}'.format(len(snapshots), **ANSI))
     if delete:
-        file_counts = [link.num_outputs for link in links if Path(link.link_dir).exists()]
+        file_counts = [snapshot.num_outputs for snapshot in snapshots if Path(snapshot.snapshot_dir).exists()]
         print(
-            f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
+            f' {len(snapshots)} Snapshots will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
             f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
         )
     else:
         print(
-            ' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n'
+            ' Matching snapshots will be de-listed from the main index, but their archived content folders will remain in place on disk.\n'
             ' (Pass --delete if you also want to permanently delete the data folders)'
         )

     if not yes:
         print()
-        print('{lightyellow}[?] Do you want to proceed with removing these {} links?{reset}'.format(len(links), **ANSI))
+        print('{lightyellow}[?] Do you want to proceed with removing these {} snapshots?{reset}'.format(len(snapshots), **ANSI))
         try:
             assert input(' y/[n]: ').lower() == 'y'
         except (KeyboardInterrupt, EOFError, AssertionError):
             raise SystemExit(0)

-def log_removal_finished(all_links: int, to_remove: int):
-    if all_links == 0:
+def log_removal_finished(all_snapshots: int, to_remove: int):
+    if to_remove == 0:
         print()
-        print('{red}[X] No matching links found.{reset}'.format(**ANSI))
+        print('{red}[X] No matching snapshots found.{reset}'.format(**ANSI))
     else:
         print()
-        print('{red}[√] Removed {} out of {} links from the archive index.{reset}'.format(
+        print('{red}[√] Removed {} out of {} snapshots from the archive index.{reset}'.format(
             to_remove,
-            all_links,
+            all_snapshots,
             **ANSI,
         ))
-        print(' Index now contains {} links.'.format(all_links - to_remove))
+        print(' Index now contains {} snapshots.'.format(all_snapshots - to_remove))


 def log_shell_welcome_msg():
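
With the rename, log_removal_finished() receives plain integers and the empty-result branch keys off to_remove rather than the total index size. A hedged usage example (the counts are made up, and the import path is an assumption since the diff does not name the file):

# --- illustrative sketch, not part of the diff ---
# from archivebox.logging_util import log_removal_finished   # assumed module name

total_in_index = 1250   # hypothetical size of the main index before removal
matched = 3             # hypothetical number of snapshots matched by the filter

log_removal_finished(total_in_index, matched)
# -> [√] Removed 3 out of 1250 snapshots from the archive index.
#    Index now contains 1247 snapshots.

log_removal_finished(total_in_index, 0)
# -> [X] No matching snapshots found.   (the new `if to_remove == 0` branch)
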
@@ -336,7 +336,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))

     all_links = Snapshot.objects.none()
-    pending_links: Dict[str, Link] = {}
+    pending_snapshots: Dict[str, Link] = {}

     if existing_index:
         all_snapshots = load_main_index(out_dir=out_dir, warn=False)
@@ -363,10 +363,10 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
         orphaned_data_dir_snapshots = {
             snapshot.url: snapshot
             for snapshot in parse_json_snapshot_details(out_dir)
-            if not all_snapshots.filter(url=link.url).exists()
+            if not all_snapshots.filter(url=snapshot.url).exists()
         }
         if orphaned_data_dir_snapshots:
-            pending_snapshots.update(orphaned_data_dir_links)
+            pending_snapshots.update(orphaned_data_dir_snapshots)
             print(' {lightyellow}√ Added {} orphaned snapshots from existing archive directories.{reset}'.format(len(orphaned_data_dir_snapshots), **ANSI))

     # Links in invalid/duplicate data dirs
@@ -383,7 +383,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     print(' archivebox list --status=invalid')


-    write_main_index(list(pending_links.values()), out_dir=out_dir)
+    write_main_index(list(pending_snapshots.values()), out_dir=out_dir)

     print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
     if existing_index:
@@ -656,24 +656,23 @@ def remove(filter_str: Optional[str]=None,
         raise SystemExit(1)


-    log_links = [link.as_link() for link in snapshots]
-    log_list_finished(log_links)
-    log_removal_started(log_links, yes=yes, delete=delete)
+    log_list_finished(snapshots)
+    log_removal_started(snapshots, yes=yes, delete=delete)

     timer = TimedProgress(360, prefix=' ')
     try:
         for snapshot in snapshots:
             if delete:
-                shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True)
+                shutil.rmtree(snapshot.snapshot_dir, ignore_errors=True)
     finally:
         timer.end()

     to_remove = snapshots.count()
+    all_snapshots = load_main_index(out_dir=out_dir).count()

     flush_search_index(snapshots=snapshots)
     remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
-    all_snapshots = load_main_index(out_dir=out_dir)
-    log_removal_finished(all_snapshots.count(), to_remove)
+    log_removal_finished(all_snapshots, to_remove)

     return all_snapshots
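
The remove() hunk also reorders the bookkeeping so that the index is counted before rows are deleted, which is what lets log_removal_finished() take two integers. A sketch of the resulting tail of remove(), assuming load_main_index(), flush_search_index(), and remove_from_sql_main_index() keep the signatures shown in the diff (the function name finish_removal is hypothetical; the real logic lives inline in remove()):

# --- illustrative sketch, not part of the diff ---
def finish_removal(snapshots, out_dir):
    """Hypothetical extraction of the end of remove() after this commit."""
    to_remove = snapshots.count()
    all_snapshots = load_main_index(out_dir=out_dir).count()   # counted before deletion

    flush_search_index(snapshots=snapshots)
    remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
    log_removal_finished(all_snapshots, to_remove)              # both arguments are ints now

    return all_snapshots   # matches the diff: the count is what gets returned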