From 417ee9e302e3b5edcf94e55bae4b06d4f9080796 Mon Sep 17 00:00:00 2001
From: mlazana
Date: Sat, 23 Mar 2019 21:27:41 +0200
Subject: [PATCH 1/6] add env variable URL_BLACKLIST

---
 archivebox/config.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/archivebox/config.py b/archivebox/config.py
index d8e01b24..791c51a7 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -47,6 +47,8 @@ WGET_BINARY = os.getenv('WGET_BINARY', 'wget')
 YOUTUBEDL_BINARY = os.getenv('YOUTUBEDL_BINARY', 'youtube-dl')
 CHROME_BINARY = os.getenv('CHROME_BINARY', None)
 
+URL_BLACKLIST = os.getenv('URL_BLACKLIST', '.*youtube.com.*,.*facebook.com/.*,.*.exe')
+
 try:
     OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR'))
 except Exception:
@@ -265,3 +267,10 @@ except KeyboardInterrupt:
 except:
     print('[X] There was an error during the startup procedure, your archive data is unaffected.')
     raise
+
+URL_BLACKLIST = re.compile(
+    r'(.*\.youtube\.com)|'
+    r'(.*\.amazon\.com)|'
+    r'(.*\.reddit\.com)',
+    re.IGNORECASE,
+    )
\ No newline at end of file

From 4d1056847750e5ba2aa1cee0800c43ceb68e1bea Mon Sep 17 00:00:00 2001
From: mlazana
Date: Sun, 24 Mar 2019 14:40:26 +0200
Subject: [PATCH 2/6] exclude links that are in blacklist

---
 archivebox/config.py |  3 ++-
 archivebox/links.py  | 18 ++++++++++++++++--
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/archivebox/config.py b/archivebox/config.py
index 791c51a7..7235e7ca 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -47,7 +47,7 @@ WGET_BINARY = os.getenv('WGET_BINARY', 'wget')
 YOUTUBEDL_BINARY = os.getenv('YOUTUBEDL_BINARY', 'youtube-dl')
 CHROME_BINARY = os.getenv('CHROME_BINARY', None)
 
-URL_BLACKLIST = os.getenv('URL_BLACKLIST', '.*youtube.com.*,.*facebook.com/.*,.*.exe')
+URL_BLACKLIST = os.getenv('URL_BLACKLIST', '.*youtube.com.*,.*facebook.com/.*,.*.exe')
 
 try:
     OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR'))
@@ -270,6 +270,7 @@ except:
 
 URL_BLACKLIST = re.compile(
     r'(.*\.youtube\.com)|'
+    r'(.*\.facebook\.com)|'
     r'(.*\.amazon\.com)|'
     r'(.*\.reddit\.com)',
     re.IGNORECASE,
diff --git a/archivebox/links.py b/archivebox/links.py
index ba8057a5..8ca9df94 100644
--- a/archivebox/links.py
+++ b/archivebox/links.py
@@ -28,13 +28,19 @@ from util import (
     check_links_structure,
 )
 
+from config import (
+    URL_BLACKLIST,
+)
 
 def validate_links(links):
     check_links_structure(links)
     links = archivable_links(links)    # remove chrome://, about:, mailto: etc.
     links = uniquefied_links(links)    # merge/dedupe duplicate timestamps & urls
     links = sorted_links(links)        # deterministically sort the links based on timestamp, url
-
+    links = exclude_links(links)       # exclude links that are in blacklist
+
+    print(links)
+
     if not links:
         print('[X] No links found :(')
         raise SystemExit(1)
@@ -42,7 +48,8 @@ def validate_links(links):
     for link in links:
         link['title'] = unescape(link['title'].strip()) if link['title'] else None
         check_link_structure(link)
-
+
+    print("FINAL LIST", list(links))
     return list(links)
@@ -115,3 +122,10 @@ def lowest_uniq_timestamp(used_timestamps, timestamp):
     new_timestamp = '{}.{}'.format(timestamp, nonce)
 
     return new_timestamp
+
+def exclude_links(links):
+    """ exclude links that are in blacklist"""
+
+    links = [link for link in links if not URL_BLACKLIST.match(link['url'])]
+
+    return links
\ No newline at end of file
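A behavioral note on the state after patch 2: the comma-separated URL_BLACKLIST string read from the environment near the top of config.py is never parsed; it is immediately shadowed by the hardcoded re.compile(...) block at the bottom of the file. The sketch below (not part of the patch) shows how that compiled pattern behaves. Keep in mind that re.match() anchors only at the start of the string, which is why every alternative leads with .*:

import re

# The hardcoded pattern from the bottom of config.py as of patch 2:
URL_BLACKLIST = re.compile(
    r'(.*\.youtube\.com)|'
    r'(.*\.facebook\.com)|'
    r'(.*\.amazon\.com)|'
    r'(.*\.reddit\.com)',
    re.IGNORECASE,
)

# re.match() anchors at the start of the string, so the leading .* in each
# alternative is what lets it skip past the 'https://' scheme prefix:
print(bool(URL_BLACKLIST.match('https://www.youtube.com/watch?v=abc')))  # True
print(bool(URL_BLACKLIST.match('https://example.com/page')))             # False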
From 81d846427e95a80cc92bac0b28f04c2e8d06ccf3 Mon Sep 17 00:00:00 2001
From: mlazana
Date: Sun, 24 Mar 2019 19:04:22 +0200
Subject: [PATCH 3/6] fix comments in links.py

---
 archivebox/links.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/archivebox/links.py b/archivebox/links.py
index 8ca9df94..fd8985ec 100644
--- a/archivebox/links.py
+++ b/archivebox/links.py
@@ -38,9 +38,7 @@ def validate_links(links):
     links = uniquefied_links(links)    # merge/dedupe duplicate timestamps & urls
     links = sorted_links(links)        # deterministically sort the links based on timestamp, url
     links = exclude_links(links)       # exclude links that are in blacklist
-
-    print(links)
-
+
     if not links:
         print('[X] No links found :(')
         raise SystemExit(1)
@@ -49,7 +47,6 @@ def validate_links(links):
         link['title'] = unescape(link['title'].strip()) if link['title'] else None
         check_link_structure(link)
 
-    print("FINAL LIST", list(links))
     return list(links)
@@ -124,7 +121,7 @@ def lowest_uniq_timestamp(used_timestamps, timestamp):
     return new_timestamp
 
 def exclude_links(links):
-    """ exclude links that are in blacklist"""
+    """exclude links that are in blacklist"""
 
     links = [link for link in links if not URL_BLACKLIST.match(link['url'])]

From 8502fa5cc3aa608a546fd93483f113a826b02332 Mon Sep 17 00:00:00 2001
From: mlazana
Date: Wed, 27 Mar 2019 20:10:05 +0200
Subject: [PATCH 4/6] config.py: update function exclude_blacklisted(links)

---
 archivebox/links.py | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/archivebox/links.py b/archivebox/links.py
index fd8985ec..5eff61f4 100644
--- a/archivebox/links.py
+++ b/archivebox/links.py
@@ -34,11 +34,11 @@ from config import (
 
 def validate_links(links):
     check_links_structure(links)
-    links = archivable_links(links)    # remove chrome://, about:, mailto: etc.
-    links = uniquefied_links(links)    # merge/dedupe duplicate timestamps & urls
-    links = sorted_links(links)        # deterministically sort the links based on timestamp, url
-    links = exclude_links(links)       # exclude links that are in blacklist
-
+    links = archivable_links(links)      # remove chrome://, about:, mailto: etc.
+    links = uniquefied_links(links)      # merge/dedupe duplicate timestamps & urls
+    links = sorted_links(links)          # deterministically sort the links based on timestamp, url
+    links = list(exclude_links(links))   # exclude URLs that match the blacklisted url pattern regex
+
     if not links:
         print('[X] No links found :(')
         raise SystemExit(1)
@@ -46,7 +46,7 @@ def validate_links(links):
     for link in links:
         link['title'] = unescape(link['title'].strip()) if link['title'] else None
         check_link_structure(link)
-
+
     return list(links)
@@ -120,9 +120,8 @@ def lowest_uniq_timestamp(used_timestamps, timestamp):
 
     return new_timestamp
 
-def exclude_links(links):
-    """exclude links that are in blacklist"""
-
-    links = [link for link in links if not URL_BLACKLIST.match(link['url'])]
-
-    return links
\ No newline at end of file
+def exclude_blacklisted(links):
+    """exclude URLs that match the blacklisted url pattern regex"""
+    return (link for link in links if not URL_BLACKLIST.match(link['url']))
+
+
\ No newline at end of file
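Two things stand out at this point in the series: the call site added above still invokes exclude_links() even though the function has been renamed to exclude_blacklisted() (this is the broken name that patch 6 fixes), and the rewrite changed the return type from a list to a generator, which is why the call site wraps it in list(). A minimal sketch, not from the patch, of why that list() wrapper matters:

# Simplified stand-in for exclude_blacklisted(): same generator shape,
# with a substring test instead of the real URL_BLACKLIST regex.
def exclude_blacklisted(links):
    return (link for link in links if 'youtube.com' not in link['url'])

links = exclude_blacklisted([{'url': 'https://example.com'}])
print(bool(links))  # True: a generator object is always truthy, even when empty
print(list(links))  # [{'url': 'https://example.com'}]
print(list(links))  # []: a generator can only be consumed once

Without the list() call, the `if not links:` check in validate_links() could never fire, and the later `for link in links:` pass would find the generator already exhausted.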
From 066b36b6a9d75d9dc15060b1329a3a617250d576 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Sat, 30 Mar 2019 14:56:19 -0400
Subject: [PATCH 5/6] make URL_BLACKLIST empty by default

---
 archivebox/config.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/archivebox/config.py b/archivebox/config.py
index ec970a22..0d49a5d2 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -47,7 +47,7 @@ WGET_BINARY = os.getenv('WGET_BINARY', 'wget')
 YOUTUBEDL_BINARY = os.getenv('YOUTUBEDL_BINARY', 'youtube-dl')
 CHROME_BINARY = os.getenv('CHROME_BINARY', None)
 
-URL_BLACKLIST = os.getenv('URL_BLACKLIST', '.*youtube.com.*,.*facebook.com/.*,.*.exe')
+URL_BLACKLIST = os.getenv('URL_BLACKLIST', None)
 
 try:
     OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR'))
@@ -76,6 +76,8 @@ USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM
 USE_WGET = FETCH_WGET or FETCH_WGET_REQUISITES or FETCH_WARC
 WGET_AUTO_COMPRESSION = USE_WGET and WGET_BINARY and (not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL).returncode)
 
+URL_BLACKLIST = URL_BLACKLIST and re.compile(URL_BLACKLIST, re.IGNORECASE)
+
 ########################### Environment & Dependencies #########################
 
 try:
@@ -268,11 +270,3 @@ except KeyboardInterrupt:
 except:
     print('[X] There was an error during the startup procedure, your archive data is unaffected.')
     raise
-
-URL_BLACKLIST = re.compile(
-    r'(.*\.youtube\.com)|'
-    r'(.*\.facebook\.com)|'
-    r'(.*\.amazon\.com)|'
-    r'(.*\.reddit\.com)',
-    re.IGNORECASE,
-    )
\ No newline at end of file
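The new `URL_BLACKLIST and re.compile(URL_BLACKLIST, re.IGNORECASE)` line relies on short-circuit evaluation: when the env variable is unset, os.getenv() returns None, which is falsy, so re.compile() never runs and URL_BLACKLIST stays None. Any non-empty value is now compiled as a single regex (the earlier comma-separated default is gone, so a comma in the value is just a literal regex character). A small sketch of the idiom, with an example pattern of my own choosing:

import re

# None stays None; a non-empty string is compiled to a pattern object.
for raw in (None, r'.*\.youtube\.com.*'):
    URL_BLACKLIST = raw and re.compile(raw, re.IGNORECASE)
    print(URL_BLACKLIST)

# Output:
# None
# re.compile('.*\\.youtube\\.com.*', re.IGNORECASE)

Because URL_BLACKLIST can now be None, every downstream use has to guard against that, which is exactly what patch 6's `if URL_BLACKLIST else True` does.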
From 529a0f8bb2655128b03b568d2fe41f506645fb9d Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Sat, 30 Mar 2019 15:00:21 -0400
Subject: [PATCH 6/6] fix broken function name

---
 archivebox/links.py | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/archivebox/links.py b/archivebox/links.py
index 5eff61f4..0ec33fe5 100644
--- a/archivebox/links.py
+++ b/archivebox/links.py
@@ -37,7 +37,6 @@ def validate_links(links):
     links = archivable_links(links)      # remove chrome://, about:, mailto: etc.
     links = uniquefied_links(links)      # merge/dedupe duplicate timestamps & urls
     links = sorted_links(links)          # deterministically sort the links based on timestamp, url
-    links = list(exclude_links(links))   # exclude URLs that match the blacklisted url pattern regex
 
     if not links:
         print('[X] No links found :(')
         raise SystemExit(1)
@@ -52,11 +51,11 @@ def validate_links(links):
 
 def archivable_links(links):
     """remove chrome://, about:// or other schemed links that can't be archived"""
-    return (
-        link
-        for link in links
-        if any(link['url'].lower().startswith(s) for s in ('http://', 'https://', 'ftp://'))
-    )
+    for link in links:
+        scheme_is_valid = scheme(link['url']) in ('http', 'https', 'ftp')
+        not_blacklisted = (not URL_BLACKLIST.match(link['url'])) if URL_BLACKLIST else True
+        if scheme_is_valid and not_blacklisted:
+            yield link
 
 
 def uniquefied_links(sorted_links):
@@ -119,9 +118,5 @@ def lowest_uniq_timestamp(used_timestamps, timestamp):
     new_timestamp = '{}.{}'.format(timestamp, nonce)
 
     return new_timestamp
-
-def exclude_blacklisted(links):
-    """exclude URLs that match the blacklisted url pattern regex"""
-    return (link for link in links if not URL_BLACKLIST.match(link['url']))
-
\ No newline at end of file
+
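The series lands on blacklist filtering folded into archivable_links(), so validate_links() no longer needs a separate exclusion step. The sketch below reproduces that final behavior end to end; the scheme() helper here is a urlparse-based stand-in for the util helper the patch calls (assumed, since no import hunk is shown), and the pattern and sample links are illustrative:

import re
from urllib.parse import urlparse

def scheme(url):
    # assumed stand-in for the scheme() helper in archivebox/util.py
    return urlparse(url).scheme

# e.g. the compiled result of setting URL_BLACKLIST='.*youtube\.com.*' in the env:
URL_BLACKLIST = re.compile(r'.*youtube\.com.*', re.IGNORECASE)

def archivable_links(links):
    """remove chrome://, about:// or other schemed links that can't be archived"""
    for link in links:
        scheme_is_valid = scheme(link['url']) in ('http', 'https', 'ftp')
        not_blacklisted = (not URL_BLACKLIST.match(link['url'])) if URL_BLACKLIST else True
        if scheme_is_valid and not_blacklisted:
            yield link

links = [
    {'url': 'https://example.com/article'},        # kept
    {'url': 'https://www.youtube.com/watch?v=x'},  # dropped: matches the blacklist
    {'url': 'chrome://settings'},                  # dropped: unarchivable scheme
]
print([link['url'] for link in archivable_links(links)])
# ['https://example.com/article']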