mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-16 16:14:28 -04:00
Improve index purge script
This commit is contained in:
parent
c37941efd1
commit
fee4565194
1 changed files with 58 additions and 26 deletions
|
@ -1,54 +1,86 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
import argparse
|
|
||||||
import re
|
import re
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
from os.path import exists, join
|
||||||
|
from shutil import rmtree
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from archive import parse_json_link_index
|
from archive import parse_json_link_index
|
||||||
from config import OUTPUT_DIR
|
from config import ARCHIVE_DIR, OUTPUT_DIR
|
||||||
from index import write_json_links_index
|
from index import write_html_links_index, write_json_links_index
|
||||||
|
|
||||||
|
|
||||||
def cleanup_index(regexes: List[str], proceed: bool, delete: bool) -> None:
    """Purge all links whose URL matches any of the given regexes.

    Rewrites both the JSON and HTML link indexes without the matched
    entries, and optionally deletes each purged link's archived data.

    :param regexes: regular-expression patterns; a link is purged when any
        pattern matches (searches, not full-matches) its URL
    :param proceed: when True, skip the interactive confirmation prompt
    :param delete: when True, also remove each purged link's data directory
        under ARCHIVE_DIR
    """
    if not exists(join(OUTPUT_DIR, 'index.json')):
        exit('index.json is missing; nothing to do')

    # Fail fast with a readable message instead of an unhandled traceback
    # when the user passes a malformed pattern on the command line.
    try:
        compiled = [re.compile(r) for r in regexes]
    except re.error as e:
        exit('Invalid regex "{}": {}'.format(e.pattern, e))

    links = parse_json_link_index(OUTPUT_DIR)['links']
    filtered = []
    remaining = []

    for link in links:
        url = link['url']
        for pattern in compiled:
            if pattern.search(url):
                # Remember which pattern matched so it can be reported.
                filtered.append((link, pattern))
                break
        else:
            # No pattern matched: keep this link in the index.
            remaining.append(link)

    if not filtered:
        exit('Search did not match any entries.')

    print('Filtered out {}/{} urls:'.format(len(filtered), len(links)))
    for link, regex in filtered:
        url = link['url']
        print(' {url} via {regex}'.format(url=url, regex=regex.pattern))

    if not proceed:
        answer = input('Remove {} entries from index? [y/n] '.format(len(filtered)))
        proceed = answer.strip().lower() in ('y', 'yes')

    if not proceed:
        exit('Aborted')

    # Rewrite both index formats without the purged links.
    write_json_links_index(OUTPUT_DIR, remaining)
    write_html_links_index(OUTPUT_DIR, remaining)

    if delete:
        # Each link's snapshot lives in a directory named by its timestamp
        # under ARCHIVE_DIR; remove it if present.
        for link, _ in filtered:
            data_dir = join(ARCHIVE_DIR, link['timestamp'])
            if exists(data_dir):
                rmtree(data_dir)
if __name__ == '__main__':
    # Command-line entry point: collect the purge options and dispatch.
    parser = ArgumentParser('Index purging tool')
    parser.add_argument(
        '--regex', '-r',
        action='append',
        help='Regular expression matching URLs to purge',
    )
    parser.add_argument(
        '--delete', '-d',
        action='store_true',
        default=False,
        help='Delete webpage files from archive',
    )
    parser.add_argument(
        '--yes', '-y',
        action='store_true',
        default=False,
        help='Do not prompt for confirmation',
    )

    args = parser.parse_args()
    # At least one --regex is required to do anything useful; otherwise
    # show the usage text.
    if args.regex:
        cleanup_index(args.regex, proceed=args.yes, delete=args.delete)
    else:
        parser.print_help()
Loading…
Add table
Add a link
Reference in a new issue