Improve index purge script

This commit is contained in:
luoliyan 2019-02-05 06:48:49 +09:30
parent c37941efd1
commit fee4565194

View file

@@ -1,54 +1,86 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import argparse
import re import re
from argparse import ArgumentParser
from os.path import exists, join
from shutil import rmtree
from typing import List from typing import List
from archive import parse_json_link_index from archive import parse_json_link_index
from config import OUTPUT_DIR from config import ARCHIVE_DIR, OUTPUT_DIR
from index import write_json_links_index from index import write_html_links_index, write_json_links_index
def cleanup_index(regexes: List[str], proceed: bool, delete: bool) -> None:
    """Remove entries whose URL matches any of *regexes* from the link index.

    Args:
        regexes: Python regular-expression strings; an entry is purged when
            any of them matches (via ``re.search``) the entry's ``url``.
        proceed: When True, skip the interactive y/n confirmation prompt.
        delete: When True, also delete each purged entry's snapshot
            directory under ARCHIVE_DIR (keyed by its timestamp).

    Exits the process early when index.json is absent or nothing matched.
    """
    # Bail out before doing any work if there is no index to operate on.
    if not exists(join(OUTPUT_DIR, 'index.json')):
        exit('index.json is missing; nothing to do')

    # Compile once up front so each URL is tested against ready patterns.
    compiled = [re.compile(r) for r in regexes]
    # NOTE(review): parse_json_link_index is a project helper; assumed to
    # return a dict with a 'links' list of entry dicts — confirm in archive.py.
    links = parse_json_link_index(OUTPUT_DIR)['links']

    filtered = []   # (entry, matching-pattern) pairs slated for removal
    remaining = []  # entries to keep in the rewritten index

    for entry in links:
        for pattern in compiled:
            if pattern.search(entry['url']):
                filtered.append((entry, pattern))
                break
        else:
            # No pattern matched this entry's URL, so it survives the purge.
            remaining.append(entry)

    if not filtered:
        exit('Search did not match any entries.')

    # Show the user exactly what will be removed, and why (which pattern hit).
    print('Filtered out {}/{} urls:'.format(len(filtered), len(links)))
    for link, regex in filtered:
        print(' {url} via {regex}'.format(url=link['url'], regex=regex.pattern))

    # Interactive confirmation unless the caller pre-approved (--yes).
    if not proceed:
        answer = input('Remove {} entries from index? [y/n] '.format(
            len(filtered)))
        proceed = answer.strip().lower() in ('y', 'yes')

    if not proceed:
        exit('Aborted')

    # Rewrite both index representations without the purged entries.
    write_json_links_index(OUTPUT_DIR, remaining)
    write_html_links_index(OUTPUT_DIR, remaining)

    if delete:
        # Best-effort removal of each purged entry's archived snapshot dir.
        for link, _ in filtered:
            data_dir = join(ARCHIVE_DIR, link['timestamp'])
            if exists(data_dir):
                rmtree(data_dir)
if __name__ == '__main__':
    # Command-line front end for cleanup_index().
    parser = ArgumentParser('Index purging tool')
    parser.add_argument(
        '--regex',
        '-r',
        action='append',  # repeatable: each -r adds another pattern
        help='Regular expression matching URLs to purge',
    )
    parser.add_argument(
        '--delete',
        '-d',
        action='store_true',
        default=False,
        help='Delete webpage files from archive',
    )
    parser.add_argument(
        '--yes',
        '-y',
        action='store_true',
        default=False,
        help='Do not prompt for confirmation',
    )

    args = parser.parse_args()
    if args.regex:
        cleanup_index(args.regex, proceed=args.yes, delete=args.delete)
    else:
        # No patterns supplied: nothing to purge, show usage instead.
        parser.print_help()