move final legacy config to plugins and fix archivebox config cmd and add search opt

This commit is contained in:
Nick Sweeting 2024-10-21 02:56:00 -07:00
parent 115f89fd8b
commit b3107ab830
No known key found for this signature in database
20 changed files with 379 additions and 275 deletions

View file

@ -1,5 +1,6 @@
__package__ = 'plugins_extractor.chrome'
__label__ = 'chrome'
__id__ = 'chrome'
__label__ = 'Chrome'
__version__ = '2024.10.14'
__author__ = 'ArchiveBox'
__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/chrome'
@ -11,13 +12,14 @@ import abx
@abx.hookimpl
def get_PLUGIN():
return {
'chrome': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
__id__: {
'id': __id__,
'package': __package__,
'label': __label__,
'version': __version__,
'author': __author__,
'homepage': __homepage__,
'dependencies': __dependencies__,
}
}
@ -26,7 +28,7 @@ def get_CONFIG():
from .config import CHROME_CONFIG
return {
'chrome': CHROME_CONFIG
__id__: CHROME_CONFIG
}
@abx.hookimpl
@ -50,22 +52,3 @@ def ready():
# 'screenshot': SCREENSHOT_EXTRACTOR,
# 'dom': DOM_EXTRACTOR,
# }
# Hooks Available:
# Events:
# on_crawl_schedule_tick
# on_seed_post_save
# on_crawl_post_save
# on_snapshot_post_save
# on_archiveresult_post_save
# create_root_snapshot_from_seed
# create_archiveresults_pending_from_snapshot
# create_crawl_from_crawlschedule_if_due
# create_crawl_copy_from_template
#
# create_crawl_from_crawlschedule_if_due

View file

@ -0,0 +1,41 @@
# Plugin metadata constants; get_PLUGIN() below returns them under the __id__ key.
__package__ = 'plugins_extractor.htmltotext'
__id__ = 'htmltotext'
__label__ = 'HTML-to-Text'
__version__ = '2024.10.14'
__author__ = 'ArchiveBox'
__homepage__ = 'https://github.com/ArchiveBox/archivebox'
__dependencies__ = []  # no dependencies declared
import abx
@abx.hookimpl
def get_PLUGIN():
    """Return this plugin's metadata record, keyed by its plugin id."""
    metadata = dict(
        id=__id__,
        package=__package__,
        label=__label__,
        version=__version__,
        author=__author__,
        homepage=__homepage__,
        dependencies=__dependencies__,
    )
    return {__id__: metadata}
@abx.hookimpl
def get_CONFIG():
    """Return this plugin's config set, keyed by its plugin id."""
    # Imported inside the hook, so .config is only loaded when the hook runs.
    from .config import HTMLTOTEXT_CONFIG as config_set

    return {__id__: config_set}
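# NOTE(review): disabled stub. It still references FAVICON_EXTRACTOR, which looks
# copied from another plugin; point it at the htmltotext extractor before enabling.
# @abx.hookimpl
# def get_EXTRACTORS():
#     from .extractors import FAVICON_EXTRACTOR
#     return {
#         'htmltotext': FAVICON_EXTRACTOR,
#     }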

View file

@ -0,0 +1,11 @@
__package__ = 'plugins_extractor.htmltotext'
from abx.archivebox.base_configset import BaseConfigSet
class HtmltotextConfig(BaseConfigSet):
    """Config set for the htmltotext plugin."""

    # Defaults to True. Presumably toggles the HTML-to-text extractor — confirm against its usage.
    SAVE_HTMLTOTEXT: bool = True


# Module-level instance of the config set above.
HTMLTOTEXT_CONFIG = HtmltotextConfig()

View file

@ -0,0 +1,37 @@
# Plugin metadata constants; get_PLUGIN() below returns them under the __id__ key.
__package__ = 'plugins_extractor.pocket'
__id__ = 'pocket'
__label__ = 'pocket'
__version__ = '2024.10.21'
__author__ = 'ArchiveBox'
__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/dev/archivebox/plugins_extractor/pocket'
__dependencies__ = []  # no dependencies declared
import abx
@abx.hookimpl
def get_PLUGIN():
    """Return this plugin's metadata record, keyed by its plugin id."""
    metadata = dict(
        id=__id__,
        package=__package__,
        label=__label__,
        version=__version__,
        author=__author__,
        homepage=__homepage__,
        dependencies=__dependencies__,
    )
    return {__id__: metadata}
@abx.hookimpl
def get_CONFIG():
    """Return this plugin's config set, keyed by its plugin id."""
    # Imported inside the hook, so .config is only loaded when the hook runs.
    from .config import POCKET_CONFIG as config_set

    return {__id__: config_set}
@abx.hookimpl
def ready():
    """Call validate() on the Pocket config set."""
    from .config import POCKET_CONFIG as pocket_config

    # NOTE(review): PocketConfig declares no validate() of its own, so this
    # presumably resolves to one inherited via BaseConfigSet — confirm it
    # accepts a zero-argument call.
    pocket_config.validate()

View file

@ -0,0 +1,15 @@
__package__ = 'plugins_extractor.pocket'
from typing import Dict
from pydantic import Field
from abx.archivebox.base_configset import BaseConfigSet
class PocketConfig(BaseConfigSet):
    """Config set for the pocket plugin: an optional consumer key plus per-user access tokens."""

    # Optional; defaults to None when not configured.
    POCKET_CONSUMER_KEY: str | None = Field(default=None)
    # NOTE(review): the default is a zero-arg callable passed via `default=`
    # (not `default_factory=`); presumably BaseConfigSet resolves callable
    # defaults — confirm.
    POCKET_ACCESS_TOKENS: Dict[str, str] = Field(default=lambda: {})    # {<username>: <access_token>, ...}


# Module-level instance of the config set above.
POCKET_CONFIG = PocketConfig()

View file

@ -0,0 +1,37 @@
# Plugin metadata constants; get_PLUGIN() below returns them under the __id__ key.
__package__ = 'plugins_extractor.readwise'
__id__ = 'readwise'
__label__ = 'readwise'
__version__ = '2024.10.21'
__author__ = 'ArchiveBox'
__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/dev/archivebox/plugins_extractor/readwise'
__dependencies__ = []  # no dependencies declared
import abx
@abx.hookimpl
def get_PLUGIN():
    """Return this plugin's metadata record, keyed by its plugin id."""
    metadata = dict(
        id=__id__,
        package=__package__,
        label=__label__,
        version=__version__,
        author=__author__,
        homepage=__homepage__,
        dependencies=__dependencies__,
    )
    return {__id__: metadata}
@abx.hookimpl
def get_CONFIG():
    """Return this plugin's config set, keyed by its plugin id."""
    # Imported inside the hook, so .config is only loaded when the hook runs.
    from .config import READWISE_CONFIG as config_set

    return {__id__: config_set}
@abx.hookimpl
def ready():
    """Call validate() on the Readwise config set."""
    from .config import READWISE_CONFIG as readwise_config

    # NOTE(review): ReadwiseConfig declares no validate() of its own, so this
    # presumably resolves to one inherited via BaseConfigSet — confirm it
    # accepts a zero-argument call.
    readwise_config.validate()

View file

@ -0,0 +1,17 @@
__package__ = 'plugins_extractor.readwise'
from typing import Dict
from pathlib import Path
from pydantic import Field
from abx.archivebox.base_configset import BaseConfigSet
from archivebox.config import CONSTANTS
class ReadwiseConfig(BaseConfigSet):
    """Config set for the readwise plugin: a database path plus per-user reader tokens."""

    # The default path is computed from CONSTANTS.SOURCES_DIR when this class
    # body is executed (i.e. at import time), not lazily.
    READWISE_DB_PATH: Path = Field(default=CONSTANTS.SOURCES_DIR / "readwise_reader_api.db")
    # NOTE(review): the default is a zero-arg callable passed via `default=`
    # (not `default_factory=`); presumably BaseConfigSet resolves callable
    # defaults — confirm.
    READWISE_READER_TOKENS: Dict[str, str] = Field(default=lambda: {})    # {<username>: <access_token>, ...}


# Module-level instance of the config set above.
READWISE_CONFIG = ReadwiseConfig()

View file

@ -14,7 +14,30 @@ class YtdlpConfig(BaseConfigSet):
USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))
YTDLP_BINARY: str = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY')
YTDLP_EXTRA_ARGS: List[str] = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS')
YTDLP_EXTRA_ARGS: List[str] = Field(default=lambda: [
'--restrict-filenames',
'--trim-filenames', '128',
'--write-description',
'--write-info-json',
'--write-annotations',
'--write-thumbnail',
'--no-call-home',
'--write-sub',
'--write-auto-subs',
'--convert-subs=srt',
'--yes-playlist',
'--continue',
# This flag doesn't exist in youtube-dl
# only in yt-dlp
'--no-abort-on-error',
# --ignore-errors must come AFTER
# --no-abort-on-error
# https://github.com/yt-dlp/yt-dlp/issues/4914
'--ignore-errors',
'--geo-bypass',
'--add-metadata',
'--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(ARCHIVING_CONFIG.MEDIA_MAX_SIZE, ARCHIVING_CONFIG.MEDIA_MAX_SIZE),
], alias='YOUTUBEDL_EXTRA_ARGS')
YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
YTDLP_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT)