diff --git a/archivebox/config.py b/archivebox/config.py
index 791c51a7..7235e7ca 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -47,7 +47,7 @@
 WGET_BINARY = os.getenv('WGET_BINARY', 'wget')
 YOUTUBEDL_BINARY = os.getenv('YOUTUBEDL_BINARY', 'youtube-dl')
 CHROME_BINARY = os.getenv('CHROME_BINARY', None)
-URL_BLACKLIST = os.getenv('URL_BLACKLIST', '.*youtube.com.*,.*facebook.com/.*,.*.exe')
+URL_BLACKLIST = os.getenv('URL_BLACKLIST', '.*youtube.com.*,.*facebook.com/.*,.*.exe')
 
 try:
     OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR'))
@@ -270,6 +270,7 @@
 except:
     URL_BLACKLIST = re.compile(
         r'(.*\.youtube\.com)|'
+        r'(.*\.facebook\.com)|'
         r'(.*\.amazon\.com)|'
         r'(.*\.reddit\.com)',
         re.IGNORECASE,
diff --git a/archivebox/links.py b/archivebox/links.py
index ba8057a5..8ca9df94 100644
--- a/archivebox/links.py
+++ b/archivebox/links.py
@@ -28,13 +28,18 @@ from util import (
     check_links_structure,
 )
 
+from config import (
+    URL_BLACKLIST,
+)
+
 
 def validate_links(links):
     check_links_structure(links)
     links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
     links = uniquefied_links(links)  # merge/dedupe duplicate timestamps & urls
     links = sorted_links(links)      # deterministically sort the links based on timestamp, url
+    links = exclude_links(links)     # drop any links whose URL matches URL_BLACKLIST
 
     if not links:
         print('[X] No links found :(')
         raise SystemExit(1)
@@ -115,3 +120,8 @@ def lowest_uniq_timestamp(used_timestamps, timestamp):
     new_timestamp = '{}.{}'.format(timestamp, nonce)
 
     return new_timestamp
+
+
+def exclude_links(links):
+    """Exclude any links whose URL matches the URL_BLACKLIST regex."""
+    return [link for link in links if not URL_BLACKLIST.match(link['url'])]
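
For reference, here is a quick standalone sketch of how the compiled fallback pattern behaves once the facebook branch is in place. The sample URLs are made up for illustration and are not part of this diff; note that `re.match` anchors only at the start of the string, which is why every branch carries a leading `.*` to skip over the scheme and subdomain:

```python
import re

# Same fallback pattern as in config.py, including the new facebook branch.
URL_BLACKLIST = re.compile(
    r'(.*\.youtube\.com)|'
    r'(.*\.facebook\.com)|'
    r'(.*\.amazon\.com)|'
    r'(.*\.reddit\.com)',
    re.IGNORECASE,
)

# Hypothetical sample URLs, not taken from the PR.
for url in (
    'https://www.youtube.com/watch?v=abc123',  # excluded: .* spans the scheme, then \.youtube\.com
    'https://www.facebook.com/some/page',      # excluded via the new facebook branch
    'https://example.com/article',             # kept: no branch matches
):
    verdict = 'excluded' if URL_BLACKLIST.match(url) else 'kept'
    print('{:8s} {}'.format(verdict, url))
```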
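
Likewise, a minimal usage check for the new `exclude_links()` helper, assuming the compiled fallback pattern above is what `config.URL_BLACKLIST` resolves to at runtime. The link dicts are fabricated and only mimic the `{'timestamp', 'url', 'title'}` shape that `validate_links()` already passes around:

```python
from links import exclude_links

# Fabricated example links in the shape validate_links() operates on.
links = [
    {'timestamp': '1544101234', 'url': 'https://www.facebook.com/some/page', 'title': 'FB page'},
    {'timestamp': '1544105678', 'url': 'https://example.com/post', 'title': 'A blog post'},
]

filtered = exclude_links(links)

# Only the non-blacklisted URL should survive.
assert [link['url'] for link in filtered] == ['https://example.com/post']
```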