mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-14 23:24:30 -04:00
Merge branch 'master' into dev
This commit is contained in:
commit
5d0185b6dd
6 changed files with 41 additions and 8 deletions
|
@ -8,6 +8,9 @@ from .util import (
|
|||
merge_links,
|
||||
)
|
||||
|
||||
from config import (
|
||||
URL_BLACKLIST,
|
||||
)
|
||||
|
||||
def validate_links(links: Iterable[Link]) -> Iterable[Link]:
|
||||
links = archivable_links(links) # remove chrome://, about:, mailto: etc.
|
||||
|
@ -22,11 +25,11 @@ def validate_links(links: Iterable[Link]) -> Iterable[Link]:
|
|||
|
||||
def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
    """Yield only the links that can be archived.

    A link is kept when both conditions hold:
      * its URL scheme is http, https or ftp (this drops chrome://,
        about:, mailto: and other schemed links that can't be archived)
      * its URL does not match URL_BLACKLIST, when URL_BLACKLIST is set

    This is a generator: links are filtered lazily as the caller iterates.
    """
    for link in links:
        scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
        # A falsy URL_BLACKLIST disables the blacklist check entirely.
        # NOTE(review): URL_BLACKLIST is presumably a compiled regex; .match()
        # only anchors at the start of the URL -- confirm that is intended.
        not_blacklisted = (not URL_BLACKLIST.match(link.url)) if URL_BLACKLIST else True
        if scheme_is_valid and not_blacklisted:
            yield link
|
||||
|
||||
|
||||
def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
|
||||
|
@ -87,3 +90,5 @@ def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
|
|||
new_timestamp = '{}.{}'.format(timestamp, nonce)
|
||||
|
||||
return new_timestamp
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue