replace uses of URL_REGEX with find_all_urls to handle markdown better

2025-05-23 03:06:55 -04:00 · 2024-04-24 17:45:45 -07:00 · 2024-04-24 17:45:45 -07:00 · beb3932d80
commit beb3932d80
parent 3afdd3d96f
4 changed files with 60 additions and 71 deletions
--- a/archivebox/parsers/init.py
+++ b/archivebox/parsers/init.py
@ -28,7 +28,7 @@ from ..util import (
    htmldecode,
    download_url,
    enforce_types,
-    URL_REGEX,
+    find_all_urls,
 )
 from ..index.schema import Link
 from ..logging_util import TimedProgress, log_source_saved
@ -202,54 +202,3 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
    log_source_saved(source_file=source_path)

    return source_path
-
-
-# Check that plain text regex URL parsing works as expected
-#   this is last-line-of-defense to make sure the URL_REGEX isn't
-#   misbehaving due to some OS-level or environment level quirks (e.g. bad regex lib)
-#   the consequences of bad URL parsing could be disastrous and lead to many
-#   incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
-_test_url_strs = {
-    'example.com': 0,
-    '/example.com': 0,
-    '//example.com': 0,
-    ':/example.com': 0,
-    '://example.com': 0,
-    'htt://example8.com': 0,
-    '/htt://example.com': 0,
-    'https://example': 1,
-    'https://localhost/2345': 1,
-    'https://localhost:1234/123': 1,
-    '://': 0,
-    'https://': 0,
-    'http://': 0,
-    'ftp://': 0,
-    'ftp://example.com': 0,
-    'https://example.com': 1,
-    'https://example.com/': 1,
-    'https://a.example.com': 1,
-    'https://a.example.com/': 1,
-    'https://a.example.com/what/is/happening.html': 1,
-    'https://a.example.com/what/ís/happening.html': 1,
-    'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
-    'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
-    'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
-    'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
-    'https://example.com?what=1#how-about-this=1&2%20baf': 1,
-    '<test>http://example7.com</test>': 1,
-    'https://<test>': 0,
-    'https://[test]': 0,
-    'http://"test"': 0,
-    'http://\'test\'': 0,
-    '[https://example8.com/what/is/this.php?what=1]': 1,
-    '[and http://example9.com?what=1&other=3#and-thing=2]': 1,
-    '<what>https://example10.com#and-thing=2 "</about>': 1,
-    'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
-    'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
-    '<or>http://examplehttp://15.badc</that>': 2,
-    'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
-    '[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
-}
-for url_str, num_urls in _test_url_strs.items():
-    assert len(re.findall(URL_REGEX, url_str)) == num_urls, (
-        f'{url_str} does not contain {num_urls} urls')