mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-16 16:14:28 -04:00
skip invalid urls at all stages
This commit is contained in:
parent
26022fc9fb
commit
225b63b732
1 changed files with 5 additions and 0 deletions
|
@@ -9,6 +9,7 @@ from itertools import chain
|
||||||
from typing import List, Tuple, Dict, Optional, Iterable
|
from typing import List, Tuple, Dict, Optional, Iterable
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
from ..system import atomic_write
|
from ..system import atomic_write
|
||||||
from ..util import (
|
from ..util import (
|
||||||
|
@@ -139,6 +140,10 @@ def validate_links(links: Iterable[Link]) -> List[Link]:
|
||||||
def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
|
def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
|
||||||
"""remove chrome://, about:// or other schemed links that cant be archived"""
|
"""remove chrome://, about:// or other schemed links that cant be archived"""
|
||||||
for link in links:
|
for link in links:
|
||||||
|
try:
|
||||||
|
urlparse(link.url)
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
|
scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
|
||||||
not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
|
not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
|
||||||
if scheme_is_valid and not_blacklisted:
|
if scheme_is_valid and not_blacklisted:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue