move abx plugins inside vendor dir

2025-06-04 08:29:49 -04:00 · 2024-10-28 04:07:35 -07:00 · 2024-10-28 04:07:35 -07:00 · b3c1cb716e
commit b3c1cb716e
parent 5d9a32c364
242 changed files with 2153 additions and 2700 deletions
--- a/archivebox/vendor/abx-plugin-readability/README.md
+++ b/archivebox/vendor/abx-plugin-readability/README.md
--- a/archivebox/vendor/abx-plugin-readability/abx_plugin_readability/__init__.py
+++ b/archivebox/vendor/abx-plugin-readability/abx_plugin_readability/__init__.py
@ -0,0 +1,30 @@
+# abx plugin package for the Readability extractor.
+# The hook implementations in this module lazily import and expose the
+# plugin's config set, binaries, and extractors.
+__package__ = 'abx_plugin_readability'
+__label__ = 'Readability'
+__homepage__ = 'https://github.com/ArchiveBox/readability-extractor'
+
+import abx
+
+
+@abx.hookimpl
+def get_CONFIG():
+    """Return this plugin's config set, keyed by 'READABILITY_CONFIG'."""
+    # imported inside the hook so the config module is only loaded on demand
+    from .config import READABILITY_CONFIG as readability_config
+
+    configs = {}
+    configs['READABILITY_CONFIG'] = readability_config
+    return configs
+
+@abx.hookimpl
+def get_BINARIES():
+    """Return the binaries this plugin provides, keyed by 'readability'."""
+    # imported inside the hook so the binaries module is only loaded on demand
+    from .binaries import READABILITY_BINARY as readability_binary
+
+    binaries = {}
+    binaries['readability'] = readability_binary
+    return binaries
+
+@abx.hookimpl
+def get_EXTRACTORS():
+    """Return the extractors this plugin provides, keyed by 'readability'.
+
+    Returns an empty dict when the extractors module does not define
+    READABILITY_EXTRACTOR, instead of raising ImportError from the hook.
+    """
+    try:
+        from .extractors import READABILITY_EXTRACTOR
+    except ImportError:
+        # extractors.py currently contains only commented-out code, so the
+        # name does not exist yet; report "no extractors" rather than crash
+        return {}
+
+    return {
+        'readability': READABILITY_EXTRACTOR,
+    }
--- a/archivebox/vendor/abx-plugin-readability/abx_plugin_readability/binaries.py
+++ b/archivebox/vendor/abx-plugin-readability/abx_plugin_readability/binaries.py
@ -0,0 +1,26 @@
+__package__ = 'abx_plugin_readability'
+
+from typing import List
+
+from pydantic import InstanceOf
+from pydantic_pkgr import Binary, BinProvider, BinaryOverrides, BinName
+
+from abx_plugin_default_binproviders import env
+from abx_plugin_npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
+
+from .config import READABILITY_CONFIG
+
+
+# package spec handed to the npm binproviders in the overrides below
+READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor'
+
+
+class ReadabilityBinary(Binary):
+    # Binary definition for the readability extractor executable.
+
+    # binary name is taken from the plugin config
+    name: BinName = READABILITY_CONFIG.READABILITY_BINARY
+
+    # supported providers, listed as: lib npm, system npm, env
+    binproviders_supported: List[InstanceOf[BinProvider]] = [
+        LIB_NPM_BINPROVIDER,
+        SYS_NPM_BINPROVIDER,
+        env,
+    ]
+
+    overrides: BinaryOverrides = {
+        LIB_NPM_BINPROVIDER.name: {
+            "packages": [READABILITY_PACKAGE_NAME],
+        },
+        SYS_NPM_BINPROVIDER.name: {
+            "packages": [READABILITY_PACKAGE_NAME],
+            # prevent modifying system global npm packages
+            "install": lambda: None,
+        },
+    }
+
+
+# module-level instance exposed through the get_BINARIES hook
+READABILITY_BINARY = ReadabilityBinary()
--- a/archivebox/vendor/abx-plugin-readability/abx_plugin_readability/config.py
+++ b/archivebox/vendor/abx-plugin-readability/abx_plugin_readability/config.py
@ -0,0 +1,17 @@
+from pydantic import Field
+
+from abx_spec_config.base_configset import BaseConfigSet
+
+from archivebox.config.common import ARCHIVING_CONFIG
+
+
+class ReadabilityConfig(BaseConfigSet):
+    # Settings for the readability extractor plugin.
+
+    # returned by should_save_readability() to decide whether to run; declared with alias USE_READABILITY
+    SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY')
+
+    # NOTE(review): the default is a zero-arg callable, not an int — presumably
+    # BaseConfigSet resolves callable defaults to ARCHIVING_CONFIG.TIMEOUT; confirm
+    READABILITY_TIMEOUT: int                 = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
+
+    # used as the Binary name in binaries.py
+    READABILITY_BINARY: str = Field(default='readability-extractor')
+    # READABILITY_EXTRA_ARGS: List[str] = []                                # readability-extractor doesn't take any extra args
+
+
+# module-level instance imported by the plugin's hooks and binaries module
+READABILITY_CONFIG = ReadabilityConfig()
--- a/archivebox/vendor/abx-plugin-readability/abx_plugin_readability/extractors.py
+++ b/archivebox/vendor/abx-plugin-readability/abx_plugin_readability/extractors.py
@ -0,0 +1,19 @@
+# __package__ = 'abx_plugin_readability'
+
+# from pathlib import Path
+
+# from pydantic_pkgr import BinName
+
+
+# from .binaries import READABILITY_BINARY
+
+
+# class ReadabilityExtractor(BaseExtractor):
+#     name: str = 'readability'
+#     binary: BinName = READABILITY_BINARY.name
+
+#     def get_output_path(self, snapshot) -> Path:
+#         return Path(snapshot.link_dir) / 'readability' / 'content.html'
+
+
+# READABILITY_EXTRACTOR = ReadabilityExtractor()
--- a/archivebox/vendor/abx-plugin-readability/abx_plugin_readability/readability.py
+++ b/archivebox/vendor/abx-plugin-readability/abx_plugin_readability/readability.py
@ -0,0 +1,118 @@
+__package__ = 'archivebox.extractors'
+
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+
+from typing import Optional
+import json
+
+from archivebox.misc.system import run, atomic_write
+from archivebox.misc.util import enforce_types, is_static_file
+from ..index.schema import Link, ArchiveResult, ArchiveError
+from ..logging_util import TimedProgress
+from .title import get_html
+
+from plugins_extractor.readability.config import READABILITY_CONFIG
+from plugins_extractor.readability.binaries import READABILITY_BINARY
+
+
+def get_output_path():
+    """Return the readability output directory, relative to the snapshot's output dir."""
+    relative_dir = 'readability/'
+    return relative_dir
+
+def get_embed_path(archiveresult=None):
+    """Return the relative path of the embeddable HTML output.
+
+    The archiveresult argument is accepted but not used.
+    """
+    return f'{get_output_path()}content.html'
+
+
+@enforce_types
+def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
+    """Decide whether the readability extractor should run for this link.
+
+    Returns False for static-file URLs, and for links whose readability
+    output dir already exists unless overwrite is set. Otherwise returns
+    the SAVE_READABILITY config value.
+    """
+    if is_static_file(link.url):
+        return False
+
+    base_dir = Path(out_dir or link.link_dir)
+    target_dir = base_dir / get_output_path()
+
+    # only consult the filesystem when we are not overwriting
+    if overwrite or not target_dir.exists():
+        return READABILITY_CONFIG.SAVE_READABILITY
+
+    return False
+
+
+@enforce_types
+def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=0) -> ArchiveResult:
+    """download reader friendly version using @mozilla/readability
+
+    Passes the HTML returned by get_html() for the link to the readability
+    binary and writes content.html, content.txt and article.json into the
+    readability/ output dir.
+
+    Args:
+        link: the Link being archived.
+        out_dir: directory to work in; falls back to link.link_dir when omitted.
+        timeout: seconds to allow; a falsy value falls back to
+            READABILITY_CONFIG.READABILITY_TIMEOUT.
+
+    Returns:
+        An ArchiveResult whose status is 'succeeded' or 'failed'. Exceptions
+        raised inside the extraction step are captured as the result's output
+        rather than propagated.
+    """
+
+    READABILITY_BIN = READABILITY_BINARY.load()
+    assert READABILITY_BIN.abspath and READABILITY_BIN.version
+
+    timeout = timeout or READABILITY_CONFIG.READABILITY_TIMEOUT
+    # resolve the working dir once so cwd/pwd are never None/'None' when out_dir is omitted
+    work_dir = out_dir or link.link_dir
+    output_subdir = Path(work_dir).absolute() / get_output_path()
+    output = get_output_path()
+
+    # Readability Docs: https://github.com/mozilla/readability
+
+    status = 'succeeded'
+    # fake command to show the user so they have something to try debugging if get_html fails
+    cmd = [
+        str(READABILITY_BIN.abspath),
+        '{dom,singlefile}.html',
+        link.url,
+    ]
+    readability_content = None
+    temp_doc_path = None
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        document = get_html(link, Path(work_dir))
+
+        # validate before encoding/writing so a missing document raises the
+        # intended ArchiveError and no temp file is created for it
+        if not document or len(document) < 10:
+            raise ArchiveError('Readability could not find HTML to parse for article text')
+
+        # delete=False because the binary needs to open the file by name after
+        # we close it; the file is removed in the finally block below
+        with NamedTemporaryFile(delete=False) as temp_doc:
+            temp_doc_path = temp_doc.name
+            temp_doc.write(document.encode("utf-8"))
+
+        cmd = [
+            str(READABILITY_BIN.abspath),
+            temp_doc_path,
+            link.url,
+        ]
+        result = run(cmd, cwd=work_dir, timeout=timeout, text=True)
+        try:
+            result_json = json.loads(result.stdout)
+            assert result_json and 'content' in result_json, 'Readability output is not valid JSON'
+        except json.JSONDecodeError:
+            raise ArchiveError('Readability was not able to archive the page (invalid JSON)', result.stdout + result.stderr)
+
+        output_subdir.mkdir(exist_ok=True)
+        readability_content = result_json.pop("textContent")
+        atomic_write(str(output_subdir / "content.html"), result_json.pop("content"))
+        atomic_write(str(output_subdir / "content.txt"), readability_content)
+        # whatever remains after the two pops is saved as the article metadata
+        atomic_write(str(output_subdir / "article.json"), result_json)
+
+        output_tail = [
+            line.strip()
+            for line in (result.stdout + result.stderr).rsplit('\n', 5)[-5:]
+            if line.strip()
+        ]
+        hints = (
+            'Got readability response code: {}.'.format(result.returncode),
+            *output_tail,
+        )
+
+        # Check for common failure cases
+        if (result.returncode > 0):
+            raise ArchiveError(f'Readability was not able to archive the page (status={result.returncode})', hints)
+    except Exception as err:
+        status = 'failed'
+        output = err
+
+        # prefer Chrome dom output to singlefile because singlefile often contains huge url(data:image/...base64) strings that make the html too long to parse with readability
+        cmd = [cmd[0], './{dom,singlefile}.html']
+    finally:
+        # best-effort cleanup of the temp copy of the document so it does not
+        # accumulate in the system temp dir
+        if temp_doc_path is not None:
+            try:
+                Path(temp_doc_path).unlink(missing_ok=True)
+            except OSError:
+                pass
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=str(work_dir),
+        cmd_version=str(READABILITY_BIN.version),
+        output=output,
+        status=status,
+        index_texts=[readability_content] if readability_content else [],
+        **timer.stats,
+    )
--- a/archivebox/vendor/abx-plugin-readability/pyproject.toml
+++ b/archivebox/vendor/abx-plugin-readability/pyproject.toml
@ -0,0 +1,17 @@
+[project]
+name = "abx-plugin-readability"
+version = "2024.10.28"
+description = "ArchiveBox abx plugin for extracting article text with readability-extractor"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "abx-spec-config>=0.1.0",
+    # NOTE(review): binaries.py also imports pydantic, pydantic_pkgr, abx_plugin_npm and
+    # abx_plugin_default_binproviders, and config.py imports archivebox — confirm whether
+    # these should be declared here too
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_readability = "abx_plugin_readability"