mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-14 07:04:27 -04:00
add plain text link parsing
This commit is contained in:
parent
7a9487fad9
commit
cf9d1875c7
2 changed files with 55 additions and 10 deletions
|
@ -42,6 +42,8 @@ base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links
|
|||
|
||||
# Keep only the text before the first '.', e.g. '1498800000.123' -> '1498800000'
short_ts = lambda ts: ts.partition('.')[0]
|
||||
|
||||
# Pattern for finding http(s) URLs embedded in plain text.
# Written as a raw string so that `\(` and `\)` reach the regex engine
# unchanged without triggering Python's invalid-escape-sequence warning.
# NOTE(review): `$-_` inside the character class is a *range* (0x24-0x5F), so it
# also admits characters such as `<`, `>`, `'` and `;` -- confirm this is intended.
URL_REGEX = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
|
||||
|
||||
|
||||
def check_dependencies():
|
||||
"""Check that all necessary dependencies are installed, and have valid versions"""
|
||||
|
@ -208,6 +210,19 @@ def download_url(url):
|
|||
|
||||
return source_path
|
||||
|
||||
|
||||
def fetch_page_title(url, default=None):
    """Attempt to guess a page's title by downloading the html.

    Args:
        url: address of the page to download.
        default: value returned when the page cannot be fetched or has no
            usable <title> element.

    Returns:
        The text of the first <title> element with surrounding whitespace
        removed and internal whitespace runs collapsed to single spaces,
        or `default`.
    """
    try:
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(url) as response:
            # Undecodable bytes are replaced rather than raised on, so a page
            # in another encoding still yields its (mostly readable) title.
            html_content = response.read().decode('utf-8', errors='replace')
    except Exception:
        # Best-effort lookup: any fetch failure just means "no title found".
        return default

    # Tolerate upper-case tags, attributes on the tag, and titles that span
    # several lines.
    match = re.search(
        r'<title(?:\s[^>]*)?>(.*?)</title\s*>',
        html_content,
        re.IGNORECASE | re.DOTALL,
    )
    if match is None:
        return default

    title = ' '.join(match.group(1).split())
    return title or default
|
||||
|
||||
|
||||
def str_between(string, start, end=None):
|
||||
"""(<abc>12345</def>, <abc>, </def>) -> 12345"""
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue