rename vendor dir to pkgs

2025-05-16 16:14:28 -04:00 · 2024-10-28 20:05:20 -07:00 · 2024-10-28 20:05:20 -07:00 · dee4eb7992
commit dee4eb7992
parent 7d75867650
168 changed files with 47 additions and 54 deletions
--- a/archivebox/vendor/abx-plugin-readability/abx_plugin_readability/readability.py
+++ b/archivebox/vendor/abx-plugin-readability/abx_plugin_readability/readability.py
@@ -1,118 +0,0 @@
-__package__ = 'archivebox.extractors'
-
-from pathlib import Path
-from tempfile import NamedTemporaryFile
-
-from typing import Optional
-import json
-
-from archivebox.misc.system import run, atomic_write
-from archivebox.misc.util import enforce_types, is_static_file
-from ..index.schema import Link, ArchiveResult, ArchiveError
-from ..logging_util import TimedProgress
-from .title import get_html
-
-from plugins_extractor.readability.config import READABILITY_CONFIG
-from plugins_extractor.readability.binaries import READABILITY_BINARY
-
-
-def get_output_path():
-    return 'readability/'
-
-def get_embed_path(archiveresult=None):
-    return get_output_path() + 'content.html'
-
-
-@enforce_types
-def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
-    
-    if is_static_file(link.url):
-        return False
-
-    output_subdir = (Path(out_dir or link.link_dir) / get_output_path())
-    if not overwrite and output_subdir.exists():
-        return False
-
-    return READABILITY_CONFIG.SAVE_READABILITY
-
-
-@enforce_types
-def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=0) -> ArchiveResult:
-    """download reader friendly version using @mozilla/readability"""
-    
-    READABILITY_BIN = READABILITY_BINARY.load()
-    assert READABILITY_BIN.abspath and READABILITY_BIN.version
-
-    timeout = timeout or READABILITY_CONFIG.READABILITY_TIMEOUT
-    output_subdir = Path(out_dir or link.link_dir).absolute() / get_output_path()
-    output = get_output_path()
-
-    # Readability Docs: https://github.com/mozilla/readability
-
-    status = 'succeeded'
-    # fake command to show the user so they have something to try debugging if get_html fails
-    cmd = [
-        str(READABILITY_BIN.abspath),
-        '{dom,singlefile}.html',
-        link.url,
-    ]
-    readability_content = None
-    timer = TimedProgress(timeout, prefix='      ')
-    try:
-        document = get_html(link, Path(out_dir or link.link_dir))
-        temp_doc = NamedTemporaryFile(delete=False)
-        temp_doc.write(document.encode("utf-8"))
-        temp_doc.close()
-
-        if not document or len(document) < 10:
-            raise ArchiveError('Readability could not find HTML to parse for article text')
-
-        cmd = [
-            str(READABILITY_BIN.abspath),
-            temp_doc.name,
-            link.url,
-        ]
-        result = run(cmd, cwd=out_dir, timeout=timeout, text=True)
-        try:
-            result_json = json.loads(result.stdout)
-            assert result_json and 'content' in result_json, 'Readability output is not valid JSON'
-        except json.JSONDecodeError:
-            raise ArchiveError('Readability was not able to archive the page (invalid JSON)', result.stdout + result.stderr)
-
-        output_subdir.mkdir(exist_ok=True)
-        readability_content = result_json.pop("textContent") 
-        atomic_write(str(output_subdir / "content.html"), result_json.pop("content"))
-        atomic_write(str(output_subdir / "content.txt"), readability_content)
-        atomic_write(str(output_subdir / "article.json"), result_json)
-
-        output_tail = [
-            line.strip()
-            for line in (result.stdout + result.stderr).rsplit('\n', 5)[-5:]
-            if line.strip()
-        ]
-        hints = (
-            'Got readability response code: {}.'.format(result.returncode),
-            *output_tail,
-        )
-
-        # Check for common failure cases
-        if (result.returncode > 0):
-            raise ArchiveError(f'Readability was not able to archive the page (status={result.returncode})', hints)
-    except (Exception, OSError) as err:
-        status = 'failed'
-        output = err
-
-        # prefer Chrome dom output to singlefile because singlefile often contains huge url(data:image/...base64) strings that make the html too long to parse with readability
-        cmd = [cmd[0], './{dom,singlefile}.html']
-    finally:
-        timer.end()
-
-    return ArchiveResult(
-        cmd=cmd,
-        pwd=str(out_dir),
-        cmd_version=str(READABILITY_BIN.version),
-        output=output,
-        status=status,
-        index_texts=[readability_content] if readability_content else [],
-        **timer.stats,  
-    )