Merge pull request #112 from karlicoss/purger

Add script to remove entries from index
Nick Sweeting 2018-11-12 10:51:53 -05:00 committed by GitHub
commit e302199f54
3 changed files with 59 additions and 0 deletions

README.md

@@ -391,6 +391,10 @@ Not all sites can be effectively archived with each method, that's why it's best
If it seems like more than 10-20% of sites in the archive are broken, open an [issue](https://github.com/pirate/bookmark-archiver/issues)
with some of the URLs that failed to be archived and I'll investigate.
**Removing unwanted links from the index:**
If you accidentally added lots of unwanted links to the index and they slow down your archiving, you can use the `bin/purge` script to remove them. It removes every link matching the Python regexes you pass to it, e.g. `bin/purge -r 'amazon\.com' -r 'google\.com'`. It will prompt before removing links from the index, but for extra safety you may want to back up `index.json` first (or put it under version control). A backup-first workflow is sketched below.
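For example, a cautious workflow might back up the index before purging (the `output/index.json` path here is illustrative; use wherever your index actually lives):

```bash
# Keep a copy of the index in case a regex matches more than intended
cp output/index.json output/index.json.bak

# Remove all Amazon and Google links from the index (prompts before writing)
bin/purge -r 'amazon\.com' -r 'google\.com'

# --yes skips the confirmation prompt, e.g. for scripted cleanup
bin/purge --yes -r 'tracker\.example\.com'
```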
### Hosting the Archive
If you're having issues trying to host the archive via nginx, make sure you already have nginx running with SSL.

54  archiver/purge.py  Executable file

@@ -0,0 +1,54 @@
#!/usr/bin/env python3
import argparse
import re

from typing import List

from archive import parse_json_link_index
from config import OUTPUT_DIR
from index import write_json_links_index


def cleanup_index(patterns: List[str], yes=False):
    regexes = [re.compile(p) for p in patterns]

    index = parse_json_link_index(OUTPUT_DIR)
    links = index['links']

    filtered = []
    remaining = []
    for link in links:
        url = link['url']
        for regex in regexes:
            if regex.search(url):
                # The first matching regex wins; remember it for the report below.
                filtered.append((link, regex))
                break
        else:
            # The inner loop finished without a break: no regex matched, keep the link.
            remaining.append(link)

    print("Filtered out {}/{} urls:".format(len(filtered), len(links)))
    for link, regex in filtered:
        url = link['url']
        print("  {url} via {regex}".format(url=url, regex=regex.pattern))

    if yes:
        proceed = True
    else:
        res = input("Remove {} entries from index? [y/n] ".format(len(filtered)))
        proceed = res.strip().lower() in ('y', 'yes')

    if proceed:
        write_json_links_index(OUTPUT_DIR, remaining)
    else:
        exit('aborting')


if __name__ == '__main__':
    p = argparse.ArgumentParser('Index purging tool')
    p.add_argument('--regex', '-r', action='append', help='Python regex to filter out')
    p.add_argument('--yes', action='store_true', default=False,
                   help='Do not prompt for confirmation')
    args = p.parse_args()

    regexes = args.regex or []  # argparse leaves this as None when no --regex is given
    cleanup_index(regexes, yes=args.yes)
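The filtering above relies on Python's `for`/`else`: the `else` branch runs only when the inner loop completes without hitting `break`, i.e. when no regex matched the URL. A minimal standalone sketch of that pattern (the patterns and URLs are illustrative, not taken from a real index):

```python
import re

patterns = [re.compile(p) for p in (r'amazon\.com', r'google\.com')]
urls = ['https://example.com/page', 'https://www.amazon.com/dp/B000']

kept = []
for url in urls:
    for regex in patterns:
        if regex.search(url):
            break              # matched: this URL gets dropped
    else:
        kept.append(url)       # no break occurred: keep the URL

print(kept)                    # ['https://example.com/page']
```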

1  bin/purge  Symbolic link

@@ -0,0 +1 @@
../archiver/purge.py