diff --git a/archivebox/config.py b/archivebox/config.py
index 29ed2df2..0d49a5d2 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -47,6 +47,8 @@
 WGET_BINARY = os.getenv('WGET_BINARY', 'wget')
 YOUTUBEDL_BINARY = os.getenv('YOUTUBEDL_BINARY', 'youtube-dl')
 CHROME_BINARY = os.getenv('CHROME_BINARY', None)
+URL_BLACKLIST = os.getenv('URL_BLACKLIST', None)
+
 try:
     OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR'))
 except Exception:
@@ -74,6 +76,8 @@
 USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM
 USE_WGET = FETCH_WGET or FETCH_WGET_REQUISITES or FETCH_WARC
 WGET_AUTO_COMPRESSION = USE_WGET and WGET_BINARY and (not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL).returncode)
+URL_BLACKLIST = URL_BLACKLIST and re.compile(URL_BLACKLIST, re.IGNORECASE)
+
 
 ########################### Environment & Dependencies #########################
 try:
diff --git a/archivebox/links.py b/archivebox/links.py
index ba8057a5..0ec33fe5 100644
--- a/archivebox/links.py
+++ b/archivebox/links.py
@@ -28,13 +28,17 @@
 from util import (
+    scheme,
     check_links_structure,
 )
+from config import (
+    URL_BLACKLIST,
+)
 
 def validate_links(links):
     check_links_structure(links)
-    links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
-    links = uniquefied_links(links)  # merge/dedupe duplicate timestamps & urls
-    links = sorted_links(links)      # deterministically sort the links based on timstamp, url
-
+    links = archivable_links(links)     # remove chrome://, about:, mailto: etc.
+    links = uniquefied_links(links)     # merge/dedupe duplicate timestamps & urls
+    links = sorted_links(links)         # deterministically sort the links based on timestamp, url
+
     if not links:
         print('[X] No links found :(')
         raise SystemExit(1)
@@ -48,11 +51,11 @@
 
 def archivable_links(links):
     """remove chrome://, about:// or other schemed links that cant be archived"""
-    return (
-        link
-        for link in links
-        if any(link['url'].lower().startswith(s) for s in ('http://', 'https://', 'ftp://'))
-    )
+    for link in links:
+        scheme_is_valid = scheme(link['url']) in ('http', 'https', 'ftp')
+        not_blacklisted = (not URL_BLACKLIST.match(link['url'])) if URL_BLACKLIST else True
+        if scheme_is_valid and not_blacklisted:
+            yield link
 
 
 def uniquefied_links(sorted_links):
@@ -115,3 +118,5 @@ def lowest_uniq_timestamp(used_timestamps, timestamp):
         new_timestamp = '{}.{}'.format(timestamp, nonce)
 
     return new_timestamp
+
+
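
For reviewers, here is a minimal standalone sketch (not part of the patch) of the intended behavior: URL_BLACKLIST is read from the environment as a regular expression, compiled case-insensitively in config.py, and any link whose URL matches the pattern is skipped by archivable_links, alongside the existing scheme check. The sample pattern and link dicts are made up for illustration, and urlparse() stands in here for the util.scheme() helper used in links.py.

```python
import os
import re
from urllib.parse import urlparse

# Hypothetical sample pattern, for illustration only.
os.environ['URL_BLACKLIST'] = r'^https?://(.*\.)?example\.com'

# Mirrors the two lines added to config.py in this patch:
URL_BLACKLIST = os.getenv('URL_BLACKLIST', None)
URL_BLACKLIST = URL_BLACKLIST and re.compile(URL_BLACKLIST, re.IGNORECASE)

# Hypothetical sample links, for illustration only.
links = [
    {'url': 'https://EXAMPLE.com/page'},   # matches blacklist (case-insensitive) -> dropped
    {'url': 'https://archive.org/about'},  # kept
    {'url': 'chrome://settings'},          # unsupported scheme -> dropped
]

def archivable(link):
    # urlparse().scheme stands in for util.scheme() from links.py
    scheme_is_valid = urlparse(link['url']).scheme in ('http', 'https', 'ftp')
    not_blacklisted = (not URL_BLACKLIST.match(link['url'])) if URL_BLACKLIST else True
    return scheme_is_valid and not_blacklisted

print([link['url'] for link in links if archivable(link)])
# -> ['https://archive.org/about']
```

If URL_BLACKLIST is unset, the compiled pattern stays None and not_blacklisted defaults to True, so existing setups keep their current behavior.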