mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-12 22:25:44 -04:00
move everything out of legacy folder
This commit is contained in:
parent
553f312125
commit
1b8abc0961
74 changed files with 3162 additions and 2629 deletions
35
archivebox/parsers/medium_rss.py
Normal file
35
archivebox/parsers/medium_rss.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
# Declare the owning package explicitly; Python resolves the relative
# imports below (`..index.schema`, `..util`) against this value.
__package__ = 'archivebox.parsers'
|
||||
|
||||
|
||||
from typing import IO, Iterable
from datetime import datetime, timezone

from xml.etree import ElementTree

from ..index.schema import Link
from ..util import (
    htmldecode,
    enforce_types,
)
|
||||
|
||||
|
||||
@enforce_types
def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
    """Parse Medium RSS feed files into links.

    Args:
        rss_file: Open text file holding the RSS XML. It is rewound to the
            start before parsing, and its ``name`` is recorded as the source
            of every link produced.

    Yields:
        One ``Link`` per ``<item>`` under the feed's ``<channel>``, built
        from the item's ``<link>``, ``<title>`` and ``<pubDate>``. A blank
        or empty title is yielded as ``None``.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        AttributeError: If ``<channel>`` is missing, or an item lacks a
            ``<link>``, ``<title>`` or ``<pubDate>`` element.
        ValueError: If a ``<pubDate>`` does not match
            ``"%a, %d %b %Y %H:%M:%S %Z"``.
        TypeError: If a ``<pubDate>`` element is present but empty.
    """

    rss_file.seek(0)
    root = ElementTree.parse(rss_file).getroot()
    items = root.find("channel").findall("item")                        # type: ignore
    for item in items:
        url = item.find("link").text                                    # type: ignore
        # An empty <title/> element has text None; treat it as a blank title
        # instead of crashing on None.strip().
        title = (item.find("title").text or "").strip()                 # type: ignore
        ts_str = item.find("pubDate").text                              # type: ignore
        published = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z")  # type: ignore

        # %Z only validates the zone name; the parsed datetime is naive, and
        # .timestamp() would interpret a naive value as local time. When the
        # feed states GMT/UTC, pin the zone so the epoch value does not depend
        # on the host's timezone. Other zone names accepted by %Z are the
        # host's own local names, for which the naive/local reading is kept.
        if ts_str.split()[-1].upper() in ("GMT", "UTC"):
            published = published.replace(tzinfo=timezone.utc)

        yield Link(
            url=htmldecode(url),
            timestamp=str(published.timestamp()),
            title=htmldecode(title) or None,
            tags=None,
            sources=[rss_file.name],
        )
|
Loading…
Add table
Add a link
Reference in a new issue