load EXTRACTORS dynamically using importlib.import_module

2025-05-21 18:35:23 -04:00 · 2024-05-11 22:28:59 -07:00 · 2024-05-11 22:28:59 -07:00 · 457c42bf84
commit 457c42bf84
parent c7f55fc3ba
18 changed files with 198 additions and 40 deletions
--- a/archivebox/extractors/title.py
+++ b/archivebox/extractors/title.py
@@ -60,6 +60,7 @@ class TitleParser(HTMLParser):
        if tag.lower() == "title":
            self.inside_title_tag = False

+
@enforce_types
 def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
    """
@@ -84,6 +85,13 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
    else:
        return document

+
+def get_output_path():
+    # Intended output filename for the extracted title. TODO: actually save the title to this file
+    # (currently the title is only stored in ArchiveResult.output as a charfield value, not written to the filesystem)
+    return 'title.json'
+
+
@enforce_types
 def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
    # if link already has valid title, skip it