mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-12 22:25:44 -04:00
move everything out of legacy folder
This commit is contained in:
parent
553f312125
commit
1b8abc0961
74 changed files with 3162 additions and 2629 deletions
47
archivebox/parsers/pinboard_rss.py
Normal file
47
archivebox/parsers/pinboard_rss.py
Normal file
|
@ -0,0 +1,47 @@
|
|||
__package__ = 'archivebox.parsers'
|
||||
|
||||
|
||||
from typing import IO, Iterable
|
||||
from datetime import datetime
|
||||
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
htmldecode,
|
||||
enforce_types,
|
||||
)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
|
||||
"""Parse Pinboard RSS feed files into links"""
|
||||
|
||||
rss_file.seek(0)
|
||||
root = ElementTree.parse(rss_file).getroot()
|
||||
items = root.findall("{http://purl.org/rss/1.0/}item")
|
||||
for item in items:
|
||||
find = lambda p: item.find(p).text.strip() if item.find(p) else None # type: ignore
|
||||
|
||||
url = find("{http://purl.org/rss/1.0/}link")
|
||||
tags = find("{http://purl.org/dc/elements/1.1/}subject")
|
||||
title = find("{http://purl.org/rss/1.0/}title")
|
||||
ts_str = find("{http://purl.org/dc/elements/1.1/}date")
|
||||
|
||||
# Pinboard includes a colon in its date stamp timezone offsets, which
|
||||
# Python can't parse. Remove it:
|
||||
if ts_str and ts_str[-3:-2] == ":":
|
||||
ts_str = ts_str[:-3]+ts_str[-2:]
|
||||
|
||||
if ts_str:
|
||||
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
|
||||
else:
|
||||
time = datetime.now()
|
||||
|
||||
yield Link(
|
||||
url=htmldecode(url),
|
||||
timestamp=str(time.timestamp()),
|
||||
title=htmldecode(title) or None,
|
||||
tags=htmldecode(tags) or None,
|
||||
sources=[rss_file.name],
|
||||
)
|
Loading…
Add table
Add a link
Reference in a new issue