mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-13 06:34:25 -04:00
new generic_html parser for extracting hrefs
This commit is contained in:
parent
a682a9c478
commit
15efb2d5ed
5 changed files with 106 additions and 39 deletions
53
archivebox/parsers/generic_html.py
Normal file
53
archivebox/parsers/generic_html.py
Normal file
|
@ -0,0 +1,53 @@
|
|||
# Pin the package name explicitly; the relative imports in this module are resolved against it.
__package__ = 'archivebox.parsers'
|
||||
|
||||
|
||||
import re
|
||||
|
||||
from typing import IO, Iterable, Optional
|
||||
from datetime import datetime
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
htmldecode,
|
||||
enforce_types,
|
||||
URL_REGEX,
|
||||
)
|
||||
from html.parser import HTMLParser
|
||||
from urllib.parse import urljoin
|
||||
|
||||
|
||||
class HrefParser(HTMLParser):
    """Collect the ``href`` value of every ``<a>`` start tag fed to the parser.

    Matching values are appended to ``self.urls`` in the order they are
    encountered. No URL resolution or validation is done here.
    """

    def __init__(self):
        super().__init__()
        # hrefs gathered so far, in encounter order
        self.urls = []

    def handle_starttag(self, tag, attrs):
        """Record the href attribute(s) of an ``<a>`` tag; ignore all other tags."""
        if tag != "a":
            return

        for attr, value in attrs:
            # HTMLParser reports a valueless attribute (``<a href>``) as None.
            # That is not a link, so skip it rather than hand None to callers.
            if attr == "href" and value is not None:
                self.urls.append(value)
|
||||
|
||||
|
||||
@enforce_types
def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Link]:
    """Parse generic HTML and yield a Link for every URL found in an <a href="..."> attribute.

    html_file: open text file to scan; it is rewound to the start first, and
        its ``.name`` is recorded as the source of every Link produced.
    root_url: optional base URL; when given, each href is resolved against it
        with urljoin before being matched against URL_REGEX.
    **_kwargs: accepted and ignored.

    Yields one Link per URL_REGEX match, using the current time as the
    timestamp and leaving title and tags unset.
    """

    html_file.seek(0)
    for line in html_file:
        # a fresh parser is used for every line, so each line is parsed on its own
        parser = HrefParser()
        # example line
        # <li><a href="http://example.com/" time_added="1478739709" tags="tag1,tag2">example title</a></li>
        parser.feed(line)
        for url in parser.urls:
            if url is None:
                # valueless attribute (<a href>): there is nothing to extract.
                # Without root_url, re.findall would raise TypeError on None;
                # with root_url, urljoin would return the base url unchanged.
                continue

            if root_url:
                # resolve relative urls /home.html -> https://example.com/home.html
                url = urljoin(root_url, url)

            for archivable_url in re.findall(URL_REGEX, url):
                yield Link(
                    url=htmldecode(archivable_url),
                    timestamp=str(datetime.now().timestamp()),
                    title=None,
                    tags=None,
                    sources=[html_file.name],
                )
|
Loading…
Add table
Add a link
Reference in a new issue