mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-13 06:34:25 -04:00
Use feedparser for RSS parsing in generic_rss and pinboard_rss parsers
The feedparser package has 20 years of history and is very good at parsing RSS and Atom, so use it instead of ad-hoc regex and XML parsing. The medium_rss and shaarli_rss parsers weren't touched because they are probably unnecessary. (The special parser for Pinboard is only needed because of how its tags work.) This doesn't include tests because I haven't figured out how to run them in the Docker development setup. Fixes #1171
This commit is contained in:
parent
7b042c854a
commit
9f462a87a8
3 changed files with 34 additions and 50 deletions
|
@ -2,13 +2,13 @@ __package__ = 'archivebox.parsers'
|
||||||
|
|
||||||
|
|
||||||
from typing import IO, Iterable
|
from typing import IO, Iterable
|
||||||
from datetime import datetime
|
from time import mktime
|
||||||
|
from feedparser import parse as feedparser
|
||||||
|
|
||||||
from ..index.schema import Link
|
from ..index.schema import Link
|
||||||
from ..util import (
|
from ..util import (
|
||||||
htmldecode,
|
htmldecode,
|
||||||
enforce_types,
|
enforce_types
|
||||||
str_between,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
|
@ -16,35 +16,27 @@ def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
|
||||||
"""Parse RSS XML-format files into links"""
|
"""Parse RSS XML-format files into links"""
|
||||||
|
|
||||||
rss_file.seek(0)
|
rss_file.seek(0)
|
||||||
items = rss_file.read().split('<item>')
|
feed = feedparser(rss_file.read())
|
||||||
items = items[1:] if items else []
|
for item in feed.entries:
|
||||||
for item in items:
|
url = item.link
|
||||||
# example item:
|
title = item.title
|
||||||
# <item>
|
time = mktime(item.updated_parsed)
|
||||||
# <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
|
|
||||||
# <category>Unread</category>
|
|
||||||
# <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
|
|
||||||
# <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
|
|
||||||
# <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
|
|
||||||
# </item>
|
|
||||||
|
|
||||||
trailing_removed = item.split('</item>', 1)[0]
|
try:
|
||||||
leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
|
tags = ','.join(map(lambda tag: tag.term, item.tags))
|
||||||
rows = leading_removed.split('\n')
|
except AttributeError:
|
||||||
|
tags = ''
|
||||||
|
|
||||||
def get_row(key):
|
if url is None:
|
||||||
return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
|
# Yielding a Link with no URL will
|
||||||
|
# crash on a URL validation assertion
|
||||||
url = str_between(get_row('link'), '<link>', '</link>')
|
continue
|
||||||
ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
|
|
||||||
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
|
|
||||||
title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
|
|
||||||
|
|
||||||
yield Link(
|
yield Link(
|
||||||
url=htmldecode(url),
|
url=htmldecode(url),
|
||||||
timestamp=str(time.timestamp()),
|
timestamp=str(time),
|
||||||
title=htmldecode(title) or None,
|
title=htmldecode(title) or None,
|
||||||
tags=None,
|
tags=tags,
|
||||||
sources=[rss_file.name],
|
sources=[rss_file.name],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -2,50 +2,41 @@ __package__ = 'archivebox.parsers'
|
||||||
|
|
||||||
|
|
||||||
from typing import IO, Iterable
|
from typing import IO, Iterable
|
||||||
from datetime import datetime, timezone
|
from time import mktime
|
||||||
|
from feedparser import parse as feedparser
|
||||||
from xml.etree import ElementTree
|
|
||||||
|
|
||||||
from ..index.schema import Link
|
from ..index.schema import Link
|
||||||
from ..util import (
|
from ..util import (
|
||||||
htmldecode,
|
htmldecode,
|
||||||
enforce_types,
|
enforce_types
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
|
def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
|
||||||
"""Parse Pinboard RSS feed files into links"""
|
"""Parse Pinboard RSS feed files into links"""
|
||||||
|
|
||||||
rss_file.seek(0)
|
rss_file.seek(0)
|
||||||
root = ElementTree.parse(rss_file).getroot()
|
feed = feedparser(rss_file.read())
|
||||||
items = root.findall("{http://purl.org/rss/1.0/}item")
|
for item in feed.entries:
|
||||||
for item in items:
|
url = item.link
|
||||||
find = lambda p: item.find(p).text.strip() if item.find(p) is not None else None # type: ignore
|
# title will start with "[priv] " if pin was marked private. useful?
|
||||||
|
title = item.title
|
||||||
|
time = mktime(item.updated_parsed)
|
||||||
|
|
||||||
url = find("{http://purl.org/rss/1.0/}link")
|
# all tags are in one entry.tags with spaces in it. annoying!
|
||||||
tags = find("{http://purl.org/dc/elements/1.1/}subject")
|
try:
|
||||||
title = find("{http://purl.org/rss/1.0/}title")
|
tags = item.tags[0].term.replace(' ', ',')
|
||||||
ts_str = find("{http://purl.org/dc/elements/1.1/}date")
|
except AttributeError:
|
||||||
|
tags = ''
|
||||||
|
|
||||||
if url is None:
|
if url is None:
|
||||||
# Yielding a Link with no URL will
|
# Yielding a Link with no URL will
|
||||||
# crash on a URL validation assertion
|
# crash on a URL validation assertion
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Pinboard includes a colon in its date stamp timezone offsets, which
|
|
||||||
# Python can't parse. Remove it:
|
|
||||||
if ts_str and ts_str[-3:-2] == ":":
|
|
||||||
ts_str = ts_str[:-3]+ts_str[-2:]
|
|
||||||
|
|
||||||
if ts_str:
|
|
||||||
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
|
|
||||||
else:
|
|
||||||
time = datetime.now(timezone.utc)
|
|
||||||
|
|
||||||
yield Link(
|
yield Link(
|
||||||
url=htmldecode(url),
|
url=htmldecode(url),
|
||||||
timestamp=str(time.timestamp()),
|
timestamp=str(time),
|
||||||
title=htmldecode(title) or None,
|
title=htmldecode(title) or None,
|
||||||
tags=htmldecode(tags) or None,
|
tags=htmldecode(tags) or None,
|
||||||
sources=[rss_file.name],
|
sources=[rss_file.name],
|
||||||
|
|
|
@ -15,6 +15,7 @@ dependencies = [
|
||||||
"dateparser>=1.0.0",
|
"dateparser>=1.0.0",
|
||||||
"django-extensions>=3.0.3",
|
"django-extensions>=3.0.3",
|
||||||
"django>=3.1.3,<3.2",
|
"django>=3.1.3,<3.2",
|
||||||
|
"feedparser>=6.0.11",
|
||||||
"ipython>5.0.0",
|
"ipython>5.0.0",
|
||||||
"mypy-extensions>=0.4.3",
|
"mypy-extensions>=0.4.3",
|
||||||
"python-crontab>=2.5.1",
|
"python-crontab>=2.5.1",
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue