From 5a2c78e14ba3fd2657b21d2455da0735a771aac1 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 6 Jul 2021 23:42:00 -0400 Subject: [PATCH] add proper support for URL_WHITELIST instead of using negation regexes --- archivebox/config.py | 2 ++ archivebox/index/__init__.py | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 17b9960c..ba68e2a3 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -77,6 +77,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'}, 'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'}, 'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'}, # to avoid downloading code assets as their own pages + 'URL_WHITELIST': {'type': str, 'default': None}, 'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True}, }, @@ -337,6 +338,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()}, 'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)}, # None means unset, so we autodetect it with find_chrome_Data_dir(), but emptystring '' means user manually set it to '', and we should store it as None 'URL_BLACKLIST_PTN': {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)}, + 'URL_WHITELIST_PTN': {'default': lambda c: c['URL_WHITELIST'] and re.compile(c['URL_WHITELIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)}, 'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')}, 'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')}, diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index 198cc563..f631430c 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -23,6 +23,7 @@ from ..config import ( OUTPUT_DIR, TIMEOUT, URL_BLACKLIST_PTN, + URL_WHITELIST_PTN, stderr, OUTPUT_PERMISSIONS ) @@ -141,10 +142,9 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]: continue if scheme(link.url) not in ('http', 'https', 'ftp'): continue - if URL_BLACKLIST_PTN and (URL_BLACKLIST_PTN.match(link.url) or URL_BLACKLIST_PTN.search(link.url)): - # https://stackoverflow.com/questions/180986/what-is-the-difference-between-re-search-and-re-match - # we want both behaviors in order to support multiple patterns in the regex, - # and negation regexes like (?!someptnhere) to allow for whitelisting + if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url): + continue + if URL_WHITELIST_PTN and (not URL_WHITELIST_PTN.search(link.url)): continue yield link