mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-12 22:25:44 -04:00
new vastly simplified plugin spec without pydantic
Some checks are pending
Build Debian package / build (push) Waiting to run
Build Docker image / buildx (push) Waiting to run
Build Homebrew package / build (push) Waiting to run
Run linters / lint (push) Waiting to run
Build Pip package / build (push) Waiting to run
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Waiting to run
Run tests / docker_tests (push) Waiting to run
Some checks are pending
Build Debian package / build (push) Waiting to run
Build Docker image / buildx (push) Waiting to run
Build Homebrew package / build (push) Waiting to run
Run linters / lint (push) Waiting to run
Build Pip package / build (push) Waiting to run
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Waiting to run
Run tests / docker_tests (push) Waiting to run
This commit is contained in:
parent
abf75f49f4
commit
01ba6d49d3
115 changed files with 2466 additions and 2301 deletions
|
@ -8,8 +8,9 @@ from collections import defaultdict
|
|||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from archivebox.misc.system import run, chmod_file
|
||||
from archivebox.misc.util import enforce_types, is_static_file, dedupe
|
||||
from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
|
||||
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
|
||||
from archivebox.plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
|
||||
from archivebox.plugins_extractor.curl.config import CURL_CONFIG
|
||||
from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
|
||||
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
|
|
@ -11,6 +11,9 @@ from archivebox.misc.util import (
|
|||
)
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
from plugins_extractor.chrome.config import CHROME_CONFIG
|
||||
from plugins_extractor.chrome.binaries import CHROME_BINARY
|
||||
|
||||
|
||||
def get_output_path():
|
||||
return 'output.html'
|
||||
|
@ -18,7 +21,6 @@ def get_output_path():
|
|||
|
||||
@enforce_types
|
||||
def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
||||
from plugins_extractor.chrome.apps import CHROME_CONFIG
|
||||
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
@ -34,8 +36,6 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
|
|||
def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
|
||||
"""print HTML of site to file using chrome --dump-html"""
|
||||
|
||||
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
|
||||
|
||||
CHROME_BIN = CHROME_BINARY.load()
|
||||
assert CHROME_BIN.abspath and CHROME_BIN.version
|
||||
|
||||
|
|
|
@ -4,8 +4,9 @@ from pathlib import Path
|
|||
|
||||
from archivebox.misc.system import chmod_file, run
|
||||
from archivebox.misc.util import enforce_types, domain, dedupe
|
||||
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
|
||||
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
|
||||
from archivebox.plugins_extractor.favicon.config import FAVICON_CONFIG
|
||||
from archivebox.plugins_extractor.curl.config import CURL_CONFIG
|
||||
from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
|
|
@ -13,10 +13,12 @@ from archivebox.misc.util import (
|
|||
without_query,
|
||||
without_fragment,
|
||||
)
|
||||
from archivebox.plugins_extractor.git.apps import GIT_CONFIG, GIT_BINARY
|
||||
from ..logging_util import TimedProgress
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
|
||||
from archivebox.plugins_extractor.git.config import GIT_CONFIG
|
||||
from archivebox.plugins_extractor.git.binaries import GIT_BINARY
|
||||
|
||||
|
||||
def get_output_path():
|
||||
return 'git/'
|
||||
|
|
|
@ -10,7 +10,8 @@ from archivebox.misc.util import (
|
|||
get_headers,
|
||||
dedupe,
|
||||
)
|
||||
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
|
||||
from archivebox.plugins_extractor.curl.config import CURL_CONFIG
|
||||
from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
|
|
@ -3,11 +3,13 @@ __package__ = 'archivebox.extractors'
|
|||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from archivebox.misc.system import run, chmod_file
|
||||
from archivebox.misc.util import enforce_types, is_static_file, dedupe
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
from plugins_extractor.ytdlp.config import YTDLP_CONFIG
|
||||
from plugins_extractor.ytdlp.binaries import YTDLP_BINARY
|
||||
|
||||
def get_output_path():
|
||||
return 'media/'
|
||||
|
@ -25,7 +27,6 @@ def get_embed_path(archiveresult=None):
|
|||
|
||||
@enforce_types
|
||||
def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
||||
from plugins_extractor.ytdlp.apps import YTDLP_CONFIG
|
||||
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
@ -40,10 +41,6 @@ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optio
|
|||
def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=0) -> ArchiveResult:
|
||||
"""Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""
|
||||
|
||||
|
||||
# from plugins_extractor.chrome.apps import CHROME_CONFIG
|
||||
from plugins_extractor.ytdlp.apps import YTDLP_BINARY, YTDLP_CONFIG
|
||||
|
||||
YTDLP_BIN = YTDLP_BINARY.load()
|
||||
assert YTDLP_BIN.abspath and YTDLP_BIN.version
|
||||
|
||||
|
|
|
@ -12,7 +12,8 @@ from archivebox.misc.util import (
|
|||
enforce_types,
|
||||
is_static_file,
|
||||
)
|
||||
from archivebox.plugins_extractor.mercury.apps import MERCURY_CONFIG, MERCURY_BINARY
|
||||
from archivebox.plugins_extractor.mercury.config import MERCURY_CONFIG
|
||||
from archivebox.plugins_extractor.mercury.binaries import MERCURY_BINARY
|
||||
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
|
|
@ -3,14 +3,17 @@ __package__ = 'archivebox.extractors'
|
|||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from archivebox.misc.system import run, chmod_file
|
||||
from archivebox.misc.util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
)
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
from plugins_extractor.chrome.config import CHROME_CONFIG
|
||||
from plugins_extractor.chrome.binaries import CHROME_BINARY
|
||||
|
||||
|
||||
def get_output_path():
|
||||
return 'output.pdf'
|
||||
|
@ -18,7 +21,6 @@ def get_output_path():
|
|||
|
||||
@enforce_types
|
||||
def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
||||
from plugins_extractor.chrome.apps import CHROME_CONFIG
|
||||
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
@ -34,8 +36,6 @@ def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
|
|||
def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
|
||||
"""print PDF of site to file using chrome --headless"""
|
||||
|
||||
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
|
||||
|
||||
CHROME_BIN = CHROME_BINARY.load()
|
||||
assert CHROME_BIN.abspath and CHROME_BIN.version
|
||||
|
||||
|
|
|
@ -6,12 +6,16 @@ from tempfile import NamedTemporaryFile
|
|||
from typing import Optional
|
||||
import json
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveError
|
||||
from archivebox.misc.system import run, atomic_write
|
||||
from archivebox.misc.util import enforce_types, is_static_file
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveError
|
||||
from ..logging_util import TimedProgress
|
||||
from .title import get_html
|
||||
|
||||
from plugins_extractor.readability.config import READABILITY_CONFIG
|
||||
from plugins_extractor.readability.binaries import READABILITY_BINARY
|
||||
|
||||
|
||||
def get_output_path():
|
||||
return 'readability/'
|
||||
|
||||
|
@ -21,7 +25,6 @@ def get_embed_path(archiveresult=None):
|
|||
|
||||
@enforce_types
|
||||
def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
|
||||
from plugins_extractor.readability.apps import READABILITY_CONFIG
|
||||
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
@ -37,8 +40,6 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite:
|
|||
def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=0) -> ArchiveResult:
|
||||
"""download reader friendly version using @mozilla/readability"""
|
||||
|
||||
from plugins_extractor.readability.apps import READABILITY_CONFIG, READABILITY_BINARY
|
||||
|
||||
READABILITY_BIN = READABILITY_BINARY.load()
|
||||
assert READABILITY_BIN.abspath and READABILITY_BIN.version
|
||||
|
||||
|
|
|
@ -3,11 +3,14 @@ __package__ = 'archivebox.extractors'
|
|||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from archivebox.misc.system import run, chmod_file
|
||||
from archivebox.misc.util import enforce_types, is_static_file
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
from plugins_extractor.chrome.config import CHROME_CONFIG
|
||||
from plugins_extractor.chrome.binaries import CHROME_BINARY
|
||||
|
||||
|
||||
def get_output_path():
|
||||
return 'screenshot.png'
|
||||
|
@ -15,7 +18,6 @@ def get_output_path():
|
|||
|
||||
@enforce_types
|
||||
def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
||||
from plugins_extractor.chrome.apps import CHROME_CONFIG
|
||||
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
@ -30,7 +32,6 @@ def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite:
|
|||
def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
|
||||
"""take screenshot of site using chrome --headless"""
|
||||
|
||||
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
|
||||
CHROME_BIN = CHROME_BINARY.load()
|
||||
assert CHROME_BIN.abspath and CHROME_BIN.version
|
||||
|
||||
|
|
|
@ -10,6 +10,11 @@ from archivebox.misc.system import run, chmod_file
|
|||
from archivebox.misc.util import enforce_types, is_static_file, dedupe
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
from plugins_extractor.chrome.config import CHROME_CONFIG
|
||||
from plugins_extractor.chrome.binaries import CHROME_BINARY
|
||||
from plugins_extractor.singlefile.config import SINGLEFILE_CONFIG
|
||||
from plugins_extractor.singlefile.binaries import SINGLEFILE_BINARY
|
||||
|
||||
|
||||
def get_output_path():
|
||||
return 'singlefile.html'
|
||||
|
@ -17,7 +22,6 @@ def get_output_path():
|
|||
|
||||
@enforce_types
|
||||
def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
||||
from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG
|
||||
|
||||
if is_static_file(link.url):
|
||||
return False
|
||||
|
@ -26,15 +30,12 @@ def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite:
|
|||
if not overwrite and (out_dir / get_output_path()).exists():
|
||||
return False
|
||||
|
||||
return SINGLEFILE_CONFIG.SAVE_SINGLEFILE
|
||||
return CHROME_CONFIG.USE_CHROME and SINGLEFILE_CONFIG.SAVE_SINGLEFILE
|
||||
|
||||
|
||||
@enforce_types
|
||||
def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
|
||||
"""download full site using single-file"""
|
||||
|
||||
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
|
||||
from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG, SINGLEFILE_BINARY
|
||||
|
||||
CHROME_BIN = CHROME_BINARY.load()
|
||||
assert CHROME_BIN.abspath and CHROME_BIN.version
|
||||
|
|
|
@ -11,7 +11,9 @@ from archivebox.misc.util import (
|
|||
htmldecode,
|
||||
dedupe,
|
||||
)
|
||||
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
|
||||
from archivebox.plugins_extractor.curl.config import CURL_CONFIG
|
||||
from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
|
|
@ -17,8 +17,8 @@ from archivebox.misc.util import (
|
|||
urldecode,
|
||||
dedupe,
|
||||
)
|
||||
from archivebox.plugins_extractor.wget.apps import WGET_BINARY, WGET_CONFIG
|
||||
|
||||
from archivebox.plugins_extractor.wget.config import WGET_CONFIG
|
||||
from archivebox.plugins_extractor.wget.binaries import WGET_BINARY
|
||||
from ..logging_util import TimedProgress
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue