new generic_html parser for extracting hrefs

2025-05-13 06:34:25 -04:00 · 2020-08-18 08:29:05 -04:00 · 2020-08-18 08:29:05 -04:00 · 15efb2d5ed
commit 15efb2d5ed
parent a682a9c478
5 changed files with 106 additions and 39 deletions
--- a/archivebox/parsers/generic_html.py
+++ b/archivebox/parsers/generic_html.py
@ -0,0 +1,53 @@
+__package__ = 'archivebox.parsers'
+
+
+import re
+
+from typing import IO, Iterable, Optional
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+    htmldecode,
+    enforce_types,
+    URL_REGEX,
+)
+from html.parser import HTMLParser
+from urllib.parse import urljoin
+
+
+class HrefParser(HTMLParser):
+    def __init__(self):
+        super().__init__()
+        self.urls = []
+
+    def handle_starttag(self, tag, attrs):
+        if tag == "a":
+            for attr, value in attrs:
+                if attr == "href":
+                    self.urls.append(value)
+
+
+@enforce_types
+def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Link]:
+    """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
+
+    html_file.seek(0)
+    for line in html_file:
+        parser = HrefParser()
+        # example line
+        # <li><a href="http://example.com/ time_added="1478739709" tags="tag1,tag2">example title</a></li>
+        parser.feed(line)
+        for url in parser.urls:
+            if root_url:
+                # resolve relative urls /home.html -> https://example.com/home.html
+                url = urljoin(root_url, url)
+            
+            for archivable_url in re.findall(URL_REGEX, url):
+                yield Link(
+                    url=htmldecode(archivable_url),
+                    timestamp=str(datetime.now().timestamp()),
+                    title=None,
+                    tags=None,
+                    sources=[html_file.name],
+                )