Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-13 22:54:27 -04:00).
Commit: "add dedicated shaarli rss parser"
This commit is contained in:
parent
522b6288c1
commit
ca8f57ef5c
2 changed files with 44 additions and 1 deletions
|
@ -44,6 +44,7 @@ def get_parsers(file):
|
||||||
('bookmarks', parse_bookmarks_export),
|
('bookmarks', parse_bookmarks_export),
|
||||||
('rss', parse_rss_export),
|
('rss', parse_rss_export),
|
||||||
('pinboard_rss', parse_pinboard_rss_feed),
|
('pinboard_rss', parse_pinboard_rss_feed),
|
||||||
|
('shaarli_rss', parse_shaarli_rss_export),
|
||||||
('medium_rss', parse_medium_rss_feed),
|
('medium_rss', parse_medium_rss_feed),
|
||||||
('plain_text', parse_plain_text),
|
('plain_text', parse_plain_text),
|
||||||
])
|
])
|
||||||
|
@ -167,6 +168,48 @@ def parse_rss_export(rss_file):
|
||||||
|
|
||||||
yield info
|
yield info
|
||||||
|
|
||||||
|
def parse_shaarli_rss_export(rss_file):
    """Parse Shaarli-specific RSS XML-format files into links"""
    # Shaarli emits an Atom-style feed; every bookmark is wrapped in an
    # <entry> element, so splitting on the opening tag isolates each one.

    rss_file.seek(0)
    raw_entries = rss_file.read().split('<entry>')[1:]

    for raw_entry in raw_entries:
        # example entry:
        # <entry>
        #   <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title>
        #   <link href="https://www.heise.de/security/meldung/Aktuelle-Trojaner-Welle-Emotet-lauert-in-gefaelschten-Rechnungsmails-4291268.html" />
        #   <id>https://demo.shaarli.org/?cEV4vw</id>
        #   <published>2019-01-30T06:06:01+00:00</published>
        #   <updated>2019-01-30T06:06:01+00:00</updated>
        #   <content type="html" xml:lang="en"><![CDATA[...]]></content>
        # </entry>

        # drop everything after the closing tag, then trim surrounding whitespace
        body = raw_entry.split('</entry>', 1)[0].strip()
        lines = body.split('\n')

        def get_row(key):
            # first line of this entry that opens a <key ...> tag
            # (raises IndexError if the tag is absent, same as a malformed feed would)
            candidates = [ln.strip() for ln in lines if ln.strip().startswith('<{}'.format(key))]
            return candidates[0]

        title = str_between(get_row('title'), '<title>', '</title>').strip()
        url = str_between(get_row('link'), '<link href="', '" />')
        published_str = str_between(get_row('published'), '<published>', '</published>')
        # NOTE(review): %z with a colon offset like "+00:00" requires Python 3.7+ — confirm target version
        published_at = datetime.strptime(published_str, "%Y-%m-%dT%H:%M:%S%z")

        info = {
            'url': url,
            'domain': domain(url),
            'base_url': base_url(url),
            'timestamp': str(published_at.timestamp()),
            'tags': '',
            # fall back to fetching the live page title when the feed entry has none
            'title': title or fetch_page_title(url),
            'sources': [rss_file.name],
        }
        info['type'] = get_link_type(info)

        yield info
def parse_bookmarks_export(html_file):
|
def parse_bookmarks_export(html_file):
|
||||||
"""Parse netscape-format bookmarks export files (produced by all browsers)"""
|
"""Parse netscape-format bookmarks export files (produced by all browsers)"""
|
||||||
|
|
||||||
|
|
|
@ -233,8 +233,8 @@ def fetch_page_title(url, default=True):
|
||||||
default = url
|
default = url
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
sys.stdout.write('.')
|
||||||
html_content = urllib.request.urlopen(url, timeout=10).read().decode('utf-8')
|
html_content = urllib.request.urlopen(url, timeout=10).read().decode('utf-8')
|
||||||
|
|
||||||
match = re.search('<title>(.*?)</title>', html_content)
|
match = re.search('<title>(.*?)</title>', html_content)
|
||||||
return match.group(1) if match else default or None
|
return match.group(1) if match else default or None
|
||||||
except Exception:
|
except Exception:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue