mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-29 14:05:21 -04:00
add proper support for URL_WHITELIST instead of using negation regexes
This commit is contained in:
parent
e4974d3536
commit
5a2c78e14b
2 changed files with 6 additions and 4 deletions
|
@@ -23,6 +23,7 @@ from ..config import (
|
|||
OUTPUT_DIR,
|
||||
TIMEOUT,
|
||||
URL_BLACKLIST_PTN,
|
||||
URL_WHITELIST_PTN,
|
||||
stderr,
|
||||
OUTPUT_PERMISSIONS
|
||||
)
|
||||
|
@@ -141,10 +142,9 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
|
|||
continue
|
||||
if scheme(link.url) not in ('http', 'https', 'ftp'):
|
||||
continue
|
||||
if URL_BLACKLIST_PTN and (URL_BLACKLIST_PTN.match(link.url) or URL_BLACKLIST_PTN.search(link.url)):
|
||||
# https://stackoverflow.com/questions/180986/what-is-the-difference-between-re-search-and-re-match
|
||||
# we want both behaviors in order to support multiple patterns in the regex,
|
||||
# and negation regexes like (?!someptnhere) to allow for whitelisting
|
||||
if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url):
|
||||
continue
|
||||
if URL_WHITELIST_PTN and (not URL_WHITELIST_PTN.search(link.url)):
|
||||
continue
|
||||
|
||||
yield link
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue