Merge pull request #138 from luoliyan/improve-purge-script

Improve index purge script
commit eeb89ae2cc
Nick Sweeting, 2019-02-04 17:49:22 -08:00 (committed by GitHub)


@@ -1,54 +1,86 @@
 #!/usr/bin/env python3
-import argparse
 import re
+from argparse import ArgumentParser
+from os.path import exists, join
+from shutil import rmtree
 from typing import List
 
 from archive import parse_json_link_index
-from config import OUTPUT_DIR
-from index import write_json_links_index
+from config import ARCHIVE_DIR, OUTPUT_DIR
+from index import write_html_links_index, write_json_links_index
 
 
-def cleanup_index(patterns: List[str], yes=False):
-    regexes = [re.compile(p) for p in patterns]
+def cleanup_index(regexes: List[str], proceed: bool, delete: bool) -> None:
+    if not exists(join(OUTPUT_DIR, 'index.json')):
+        exit('index.json is missing; nothing to do')
 
-    index = parse_json_link_index(OUTPUT_DIR)
-    links = index['links']
-
+    compiled = [re.compile(r) for r in regexes]
+    links = parse_json_link_index(OUTPUT_DIR)['links']
     filtered = []
     remaining = []
 
     for l in links:
         url = l['url']
-        for r in regexes:
+        for r in compiled:
             if r.search(url):
                 filtered.append((l, r))
                 break
         else:
             remaining.append(l)
 
-    print("Filtered out {}/{} urls:".format(len(filtered), len(links)))
+    if not filtered:
+        exit('Search did not match any entries.')
+
+    print('Filtered out {}/{} urls:'.format(len(filtered), len(links)))
     for link, regex in filtered:
         url = link['url']
-        print(" {url} via {regex}".format(url=url, regex=regex.pattern))
+        print(' {url} via {regex}'.format(url=url, regex=regex.pattern))
 
-    proceed = False
-    if yes:
-        proceed = True
-    else:
-        res = input("Remove {} entries from index? [y/n] ".format(len(filtered)))
-        proceed = res.strip().lower() in ('y', 'yes')
+    if not proceed:
+        answer = input('Remove {} entries from index? [y/n] '.format(
+            len(filtered)))
+        proceed = answer.strip().lower() in ('y', 'yes')
 
-    if proceed:
-        write_json_links_index(OUTPUT_DIR, remaining)
-    else:
-        exit('aborting')
+    if not proceed:
+        exit('Aborted')
+
+    write_json_links_index(OUTPUT_DIR, remaining)
+    write_html_links_index(OUTPUT_DIR, remaining)
+
+    if delete:
+        for link, _ in filtered:
+            data_dir = join(ARCHIVE_DIR, link['timestamp'])
+            if exists(data_dir):
+                rmtree(data_dir)
 
 
 if __name__ == '__main__':
-    p = argparse.ArgumentParser('Index purging tool')
-    p.add_argument('--regex', '-r', action='append', help='Python regex to filter out')
-    p.add_argument('--yes', action='store_true', default=False, help='Do not propmpt for confirmation')
+    p = ArgumentParser('Index purging tool')
+    p.add_argument(
+        '--regex',
+        '-r',
+        action='append',
+        help='Regular expression matching URLs to purge',
+    )
+    p.add_argument(
+        '--delete',
+        '-d',
+        action='store_true',
+        default=False,
+        help='Delete webpage files from archive',
+    )
+    p.add_argument(
+        '--yes',
+        '-y',
+        action='store_true',
+        default=False,
+        help='Do not prompt for confirmation',
+    )
 
     args = p.parse_args()
-
-    regexes = args.regex
-
-    cleanup_index(regexes, yes=args.yes)
+    if args.regex:
+        cleanup_index(args.regex, proceed=args.yes, delete=args.delete)
+    else:
+        p.print_help()
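
Note on the filtering logic: the script relies on Python's for/else, where the else branch runs only when the inner loop finishes without hitting break, so a link lands in remaining only if no regex matched its URL. A minimal standalone sketch of that partitioning pattern (the URLs and pattern here are made up for illustration):

import re

links = [
    {'url': 'https://example.com/a'},
    {'url': 'https://tracker.test/b'},
    {'url': 'https://example.com/c'},
]
compiled = [re.compile(r'example\.com')]

filtered, remaining = [], []
for link in links:
    for regex in compiled:
        if regex.search(link['url']):
            # the first matching regex claims the link
            filtered.append((link, regex))
            break
    else:
        # no regex matched this URL
        remaining.append(link)

print(len(filtered), len(remaining))  # prints: 2 1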
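
Also worth noting: action='append' lets --regex/-r be passed multiple times, and args.regex is None when the flag is absent, which is what the new `if args.regex:` guard (falling back to p.print_help()) relies on. A quick check of that argparse behavior:

from argparse import ArgumentParser

p = ArgumentParser('Index purging tool')
p.add_argument('--regex', '-r', action='append')

print(p.parse_args(['-r', r'example\.com', '-r', r'tracker\.test']).regex)
# ['example\\.com', 'tracker\\.test']
print(p.parse_args([]).regex)
# None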
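
Finally, the new --delete path keys each snapshot directory by the link's timestamp under ARCHIVE_DIR and removes it with rmtree. A self-contained sketch of that cleanup against a throwaway directory (the archive root, URL, and timestamp are invented):

import os
from os.path import exists, join
from shutil import rmtree
from tempfile import mkdtemp

ARCHIVE_DIR = mkdtemp()  # stand-in for the real archive root
os.makedirs(join(ARCHIVE_DIR, '1549300000'))  # fake per-link snapshot dir

filtered = [({'url': 'https://example.com/a', 'timestamp': '1549300000'}, None)]

for link, _ in filtered:
    data_dir = join(ARCHIVE_DIR, link['timestamp'])
    if exists(data_dir):
        rmtree(data_dir)

print(exists(join(ARCHIVE_DIR, '1549300000')))  # prints: False

Assuming the script is saved as purge.py, a typical invocation would then be python3 purge.py -r 'example\.com' --delete --yes.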