diff --git a/archivebox/legacy/logs.py b/archivebox/legacy/logs.py
index 191f76b1..941f49d9 100644
--- a/archivebox/legacy/logs.py
+++ b/archivebox/legacy/logs.py
@@ -3,7 +3,7 @@ import sys
 
 from datetime import datetime
 from dataclasses import dataclass
-from typing import Optional
+from typing import Optional, List
 
 from .schema import Link, ArchiveResult
 from .config import ANSI, OUTPUT_DIR
@@ -205,3 +205,58 @@ def log_archive_method_finished(result: ArchiveResult):
             if line
         ))
         print()
+
+
+def log_list_started(filter_patterns: List[str], filter_type: str):
+    print('{green}[*] Finding links in the archive index matching these {} patterns:{reset}'.format(
+        filter_type,
+        **ANSI,
+    ))
+    print('    {}'.format(' '.join(filter_patterns)))
+
+def log_list_finished(links):
+    from .util import to_csv
+    print()
+    print('---------------------------------------------------------------------------------------------------')
+    print(to_csv(links, csv_cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
+    print('---------------------------------------------------------------------------------------------------')
+    print()
+
+
+def log_removal_started(links: List[Link], yes: bool, delete: bool):
+
+    log_list_finished(links)
+    print('{lightyellow}[i] Found {} matching URLs to remove.{reset}'.format(len(links), **ANSI))
+    if delete:
+        file_counts = [link.num_outputs for link in links if os.path.exists(link.link_dir)]
+        print(
+            f'    {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
+            f'    ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
+        )
+    else:
+        print(
+            f'    Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n'
+            f'    (Pass --delete if you also want to permanently delete the data folders)'
+        )
+
+    if not yes:
+        print()
+        print('{lightyellow}[?] Do you want to proceed with removing these {} links?{reset}'.format(len(links), **ANSI))
+        try:
+            assert input('    y/[n]: ').lower() == 'y'
+        except (KeyboardInterrupt, EOFError, AssertionError):
+            raise SystemExit(0)
+
+def log_removal_finished(all_links: int, to_keep: int):
+    if all_links == 0:
+        print()
+        print('{red}[X] No matching links found.{reset}'.format(**ANSI))
+    else:
+        num_removed = all_links - to_keep
+        print()
+        print('{red}[√] Removed {} out of {} links from the archive index.{reset}'.format(
+            num_removed,
+            all_links,
+            **ANSI,
+        ))
+        print('    Index now contains {} links.'.format(to_keep))
diff --git a/archivebox/legacy/main.py b/archivebox/legacy/main.py
index b669c5cc..3f2f21a5 100644
--- a/archivebox/legacy/main.py
+++ b/archivebox/legacy/main.py
@@ -4,7 +4,7 @@ import shutil
 from typing import List, Optional, Iterable
 
 from .schema import Link
-from .util import enforce_types, TimedProgress, to_csv
+from .util import enforce_types, TimedProgress
 from .index import (
     links_after_timestamp,
     load_links_index,
@@ -21,6 +21,10 @@ from .logs import (
     log_archiving_started,
     log_archiving_paused,
     log_archiving_finished,
+    log_removal_started,
+    log_removal_finished,
+    log_list_started,
+    log_list_finished,
 )
 
 
@@ -69,6 +73,7 @@ LINK_FILTERS = {
     'domain': lambda link, pattern: link.domain == pattern,
 }
 
+@enforce_types
 def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool:
     for pattern in filter_patterns:
         if LINK_FILTERS[filter_type](link, pattern):
@@ -99,12 +104,10 @@ def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: st
 @enforce_types
 def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
                          after: Optional[float]=None, before: Optional[float]=None,
-                         yes: bool=False, delete: bool=False):
+                         yes: bool=False, delete: bool=False) -> List[Link]:
 
     check_dependencies()
-
-    print('[*] Finding links in the archive index matching these {} patterns:'.format(filter_type))
-    print('    {}'.format(' '.join(filter_patterns)))
+    log_list_started(filter_patterns, filter_type)
     timer = TimedProgress(360, prefix='      ')
     try:
         links = list(list_archive_data(
@@ -116,37 +119,28 @@ def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
     finally:
         timer.end()
     if not len(links):
-        print()
-        print('{red}[X] No matching links found.{reset}'.format(**ANSI))
+        log_removal_finished(0, 0)
         raise SystemExit(1)
 
-    print()
-    print('-------------------------------------------------------------------')
-    print(to_csv(links, csv_cols=['link_dir', 'url', 'is_archived', 'num_outputs']))
-    print('-------------------------------------------------------------------')
-    print()
-    if not yes:
-        resp = input('{lightyellow}[?] Are you sure you want to permanently remove these {} archived links? N/y: {reset}'.format(len(links), **ANSI))
-
-        if not resp.lower() == 'y':
-            raise SystemExit(0)
+    log_removal_started(links, yes=yes, delete=delete)
+    timer = TimedProgress(360, prefix='      ')
+    try:
+        to_keep = []
+        all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
+        for link in all_links:
+            should_remove = (
+                (after is not None and float(link.timestamp) < after)
+                or (before is not None and float(link.timestamp) > before)
+                or link_matches_filter(link, filter_patterns, filter_type)
+            )
+            if not should_remove:
+                to_keep.append(link)
+            elif should_remove and delete:
+                shutil.rmtree(link.link_dir)
+    finally:
+        timer.end()
 
-    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
-    to_keep = []
-
-    for link in all_links:
-        should_remove = (
-            (after is not None and float(link.timestamp) < after)
-            or (before is not None and float(link.timestamp) > before)
-            or link_matches_filter(link, filter_patterns, filter_type)
-        )
-        if not should_remove:
-            to_keep.append(link)
-        elif should_remove and delete:
-            shutil.rmtree(link.link_dir)
-
-    num_removed = len(all_links) - len(to_keep)
     write_links_index(links=to_keep, out_dir=OUTPUT_DIR, finished=True)
-    print()
-    print('{red}[√] Removed {} out of {} links from the archive index.{reset}'.format(num_removed, len(all_links), **ANSI))
-    print('    Index now contains {} links.'.format(len(to_keep)))
+    log_removal_finished(len(all_links), len(to_keep))
+
+    return to_keep
diff --git a/archivebox/legacy/schema.py b/archivebox/legacy/schema.py
index 8b5ca6db..08fb6b70 100644
--- a/archivebox/legacy/schema.py
+++ b/archivebox/legacy/schema.py
@@ -64,12 +64,12 @@ class ArchiveResult:
 
         return to_json(self, indent=indent, sort_keys=sort_keys)
 
-    def to_csv(self, cols=None):
+    def to_csv(self, cols=None, ljust: int=0, separator: str=','):
         from .util import to_json
 
         cols = cols or self.field_names()
-        return ','.join(
-            to_json(getattr(self, col), indent=False)
+        return separator.join(
+            to_json(getattr(self, col), indent=False).ljust(ljust)
             for col in cols
         )
 
@@ -187,11 +187,11 @@ class Link:
 
         return to_json(self, indent=indent, sort_keys=sort_keys)
 
-    def to_csv(self, csv_cols: List[str]):
+    def to_csv(self, csv_cols: List[str], ljust: int=0, separator: str=','):
         from .util import to_json
 
-        return ','.join(
-            to_json(getattr(self, col), indent=None)
+        return separator.join(
+            to_json(getattr(self, col), indent=None).ljust(ljust)
             for col in csv_cols
         )
 
diff --git a/archivebox/legacy/util.py b/archivebox/legacy/util.py
index 6763f9ad..ffcac217 100644
--- a/archivebox/legacy/util.py
+++ b/archivebox/legacy/util.py
@@ -624,10 +624,20 @@ def to_json(obj: Any, file: IO=None, indent: Optional[int]=4, sort_keys: bool=Tr
     return json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
 
 
-def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None, header: bool=True) -> str:
+def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
+           header: bool=True, ljust: int=0, separator: str=',') -> str:
     csv_cols = csv_cols or ['timestamp', 'is_archived', 'url']
-    header_str = '{}\n'.format(','.join(csv_cols)) if header else ''
-    return header_str + '\n'.join(link.to_csv(csv_cols=csv_cols) for link in links)
+
+    header_str = ''
+    if header:
+        header_str = separator.join(col.ljust(ljust) for col in csv_cols)
+
+    row_strs = (
+        link.to_csv(csv_cols=csv_cols, ljust=ljust, separator=separator)
+        for link in links
+    )
+
+    return '\n'.join((header_str, *row_strs))
 
 
 def atomic_write(contents: Union[dict, str], path: str) -> None: