move pdf, screenshot, dom, singlefile, and ytdlp extractor config to new plugin system

Nick Sweeting 2024-09-25 00:42:26 -07:00
parent a2a586e369
commit a5ffd4e9d3
No known key found for this signature in database
11 changed files with 333 additions and 353 deletions
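The diff below repeats one refactor pattern across all five extractors (pdf, screenshot, dom, singlefile, media): module-level constants imported from ..config are replaced by plugin config objects imported lazily inside each function, and binaries are resolved at call time with .load(). A minimal sketch of the resulting shape, simplified from the dom.py hunks below (error handling and ArchiveResult fields omitted):

    # sketch only: simplified from the save_dom() changes in this commit
    def save_dom(link, out_dir=None, timeout: int=60):
        # plugin apps are imported lazily, inside the function, so Django is set up first
        from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY

        CHROME_BIN = CHROME_BINARY.load()                  # resolves abspath + version via pydantic_pkgr
        assert CHROME_BIN.abspath and CHROME_BIN.version

        cmd = [
            str(CHROME_BIN.abspath),
            *CHROME_CONFIG.chrome_args(),                  # flags now come from ChromeConfig, not util.chrome_args()
            '--dump-dom',
            link.url,
        ]
        ...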

View file

@@ -30,6 +30,7 @@ import inspect
 import getpass
 import shutil
 import requests
+import archivebox
 from hashlib import md5
 from pathlib import Path

@@ -62,7 +63,6 @@ from .misc.logging import (
     stderr,
     hint,
 )
-from .misc.checks import check_system_config

 # print('STARTING CONFIG LOADING')

@@ -167,15 +167,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
     'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}'},
     'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}'},
-    'CHROME_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']},
     'COOKIES_FILE': {'type': str, 'default': None},
-    'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
-    'CHROME_TIMEOUT': {'type': int, 'default': 0},
-    'CHROME_HEADLESS': {'type': bool, 'default': True},
-    'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
-    'CHROME_EXTRA_ARGS': {'type': list, 'default': None},
     'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
         '--restrict-filenames',

@@ -267,7 +260,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},  # also can accept youtube-dl
     'NODE_BINARY': {'type': str, 'default': 'node'},
     'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
-    'CHROME_BINARY': {'type': str, 'default': None},
     'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
     'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
@@ -551,7 +543,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
     'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
     'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
-    'CHROME_USER_DATA_DIR': {'default': lambda c: Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None},
     'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
     'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
     'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},  # exec is always needed to list directories

@@ -595,7 +587,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
     'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
-    'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
+    # 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
     'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
     'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},

@@ -620,15 +612,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
     'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},
-    'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] or find_chrome_binary()},
-    'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
-    'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
-    'CHROME_USER_AGENT': {'default': lambda c: c['CHROME_USER_AGENT'].format(**c)},
-    'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
-    'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
-    'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']},
-    'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SINGLEFILE'] and c['USE_NODE']},
     'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
     'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},

@@ -638,8 +621,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
     'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
     'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
-    'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
-    'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []},
     'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
     'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
 }
@@ -1183,21 +1165,20 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
             'enabled': config['USE_YOUTUBEDL'],
             'is_valid': bool(config['YOUTUBEDL_VERSION']),
         },
-        'CHROME_BINARY': {
-            'path': bin_path(config['CHROME_BINARY']),
-            'version': config['CHROME_VERSION'],
-            'hash': bin_hash(config['CHROME_BINARY']),
-            'enabled': config['USE_CHROME'],
-            'is_valid': bool(config['CHROME_VERSION']),
-        },
-        'RIPGREP_BINARY': {
-            'path': bin_path(config['RIPGREP_BINARY']),
-            'version': config['RIPGREP_VERSION'],
-            'hash': bin_hash(config['RIPGREP_BINARY']),
-            'enabled': config['USE_RIPGREP'],
-            'is_valid': bool(config['RIPGREP_VERSION']),
-        },
+        # 'CHROME_BINARY': {
+        #     'path': bin_path(config['CHROME_BINARY']),
+        #     'version': config['CHROME_VERSION'],
+        #     'hash': bin_hash(config['CHROME_BINARY']),
+        #     'enabled': config['USE_CHROME'],
+        #     'is_valid': bool(config['CHROME_VERSION']),
+        # },
+        # 'RIPGREP_BINARY': {
+        #     'path': bin_path(config['RIPGREP_BINARY']),
+        #     'version': config['RIPGREP_VERSION'],
+        #     'hash': bin_hash(config['RIPGREP_BINARY']),
+        #     'enabled': config['USE_RIPGREP'],
+        #     'is_valid': bool(config['RIPGREP_VERSION']),
+        # },
+        # TODO: add an entry for the sonic search backend?
         # 'SONIC_BINARY': {
         #     'path': bin_path(config['SONIC_BINARY']),
         #     'version': config['SONIC_VERSION'],

@@ -1207,20 +1188,6 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
         # },
     }

-def get_chrome_info(config: ConfigDict) -> ConfigValue:
-    return {
-        'TIMEOUT': config['TIMEOUT'],
-        'RESOLUTION': config['RESOLUTION'],
-        'CHECK_SSL_VALIDITY': config['CHECK_SSL_VALIDITY'],
-        'CHROME_BINARY': bin_path(config['CHROME_BINARY']),
-        'CHROME_TIMEOUT': config['CHROME_TIMEOUT'],
-        'CHROME_HEADLESS': config['CHROME_HEADLESS'],
-        'CHROME_SANDBOX': config['CHROME_SANDBOX'],
-        'CHROME_USER_AGENT': config['CHROME_USER_AGENT'],
-        'CHROME_USER_DATA_DIR': config['CHROME_USER_DATA_DIR'],
-    }

 # ******************************************************************************
 # ******************************************************************************
 # ******************************** Load Config *********************************
@@ -1264,27 +1231,6 @@ os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8))  # n
 # add ./node_modules/.bin to $PATH so we can use node scripts in extractors
 sys.path.append(CONFIG.NODE_BIN_PATH)

-# OPTIONAL: also look around the host system for node modules to use
-#   avoid enabling this unless absolutely needed,
-#   having overlapping potential sources of libs is a big source of bugs/confusing to users
-# DEV_NODE_BIN_PATH = str((Path(CONFIG["PACKAGE_DIR"]).absolute() / '..' / 'node_modules' / '.bin'))
-# sys.path.append(DEV_NODE_BIN_PATH)
-# USER_NODE_BIN_PATH = str(Path('~/.node_modules/.bin').resolve())
-# sys.path.append(USER_NODE_BIN_PATH)
-
-# disable stderr "you really shouldnt disable ssl" warnings with library config
-if not CONFIG['CHECK_SSL_VALIDITY']:
-    import urllib3
-    requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
-    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-
-# get SQLite database version, compile options, and runtime options
-# TODO: make this a less hacky proper assertion checker helper function in somewhere like setup_django
-#cursor = sqlite3.connect(':memory:').cursor()
-#DYNAMIC_CONFIG_SCHEMA['SQLITE_VERSION'] = lambda c: cursor.execute("SELECT sqlite_version();").fetchone()[0]
-#DYNAMIC_CONFIG_SCHEMA['SQLITE_JOURNAL_MODE'] = lambda c: cursor.execute('PRAGMA journal_mode;').fetchone()[0]
-#DYNAMIC_CONFIG_SCHEMA['SQLITE_OPTIONS'] = lambda c: [option[0] for option in cursor.execute('PRAGMA compile_options;').fetchall()]
-#cursor.close()

 ########################### Config Validity Checkers ###########################

@@ -1308,13 +1254,19 @@ def bump_startup_progress_bar():
     if INITIAL_STARTUP_PROGRESS:
         INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=1)  # type: ignore

+def setup_django_minimal():
+    sys.path.append(str(archivebox.PACKAGE_DIR))
+    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
+    django.setup()

 def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, in_memory_db=False) -> None:
     global INITIAL_STARTUP_PROGRESS
     global INITIAL_STARTUP_PROGRESS_TASK

     with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS:
         INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)

-        check_system_config(config)

         output_dir = out_dir or Path(config['OUTPUT_DIR'])
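The new setup_django_minimal() gives lightweight subcommands a way to import plugin apps (which need Django's app registry) without the collection checks and progress bar that setup_django() runs; schedule(), manage(), and version() in main.py below call it before importing from plugins_pkg/plugins_sys. A sketch of the calling pattern used there:

    setup_django_minimal()                                 # django.setup() only, no data-dir checks
    from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY     # plugin apps are safe to import after setup
    print(ARCHIVEBOX_BINARY.load().abspath)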

View file

@@ -8,13 +8,6 @@ from ..system import run, chmod_file, atomic_write
 from ..util import (
     enforce_types,
     is_static_file,
-    chrome_args,
-    chrome_cleanup,
-)
-from ..config import (
-    TIMEOUT,
-    SAVE_DOM,
-    CHROME_VERSION,
 )
 from ..logging_util import TimedProgress

@@ -25,6 +18,8 @@ def get_output_path():
 @enforce_types
 def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
+    from plugins_extractor.chrome.apps import CHROME_CONFIG

     if is_static_file(link.url):
         return False

@@ -33,42 +28,48 @@
     if (out_dir / get_output_path()).stat().st_size > 1:
         return False

-    return SAVE_DOM
+    return CHROME_CONFIG.SAVE_DOM

 @enforce_types
-def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
     """print HTML of site to file using chrome --dump-html"""

+    from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
+    CHROME_BIN = CHROME_BINARY.load()
+    assert CHROME_BIN.abspath and CHROME_BIN.version

     out_dir = out_dir or Path(link.link_dir)
     output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     cmd = [
-        *chrome_args(),
+        str(CHROME_BIN.abspath),
+        *CHROME_CONFIG.chrome_args(),
         '--dump-dom',
         link.url
     ]
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
     try:
-        result = run(cmd, cwd=str(out_dir), timeout=timeout)
+        result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True)
         atomic_write(output_path, result.stdout)

         if result.returncode:
-            hints = result.stderr.decode()
+            hints = result.stderr
             raise ArchiveError('Failed to save DOM', hints)

         chmod_file(output, cwd=str(out_dir))
     except Exception as err:
         status = 'failed'
         output = err
-        chrome_cleanup()
+        CHROME_BINARY.chrome_cleanup_lockfile()
     finally:
         timer.end()

     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=CHROME_VERSION,
+        cmd_version=str(CHROME_BIN.version),
         output=output,
         status=status,
         **timer.stats,

View file

@@ -5,20 +5,7 @@ from typing import Optional
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..system import run, chmod_file
-from ..util import (
-    enforce_types,
-    is_static_file,
-    dedupe,
-)
-from ..config import (
-    MEDIA_TIMEOUT,
-    SAVE_MEDIA,
-    YOUTUBEDL_ARGS,
-    YOUTUBEDL_EXTRA_ARGS,
-    YOUTUBEDL_BINARY,
-    YOUTUBEDL_VERSION,
-    CHECK_SSL_VALIDITY
-)
+from ..util import enforce_types, is_static_file, dedupe
 from ..logging_util import TimedProgress

@@ -38,6 +25,8 @@ def get_embed_path(archiveresult=None):
 @enforce_types
 def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
+    from plugins_extractor.ytdlp.apps import YTDLP_CONFIG

     if is_static_file(link.url):
         return False

@@ -45,45 +34,52 @@
     if not overwrite and (out_dir / get_output_path()).exists():
         return False

-    return SAVE_MEDIA
+    return YTDLP_CONFIG.USE_YTDLP

 @enforce_types
-def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
+def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=0) -> ArchiveResult:
     """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""

+    # from plugins_extractor.chrome.apps import CHROME_CONFIG
+    from plugins_extractor.ytdlp.apps import YTDLP_BINARY, YTDLP_CONFIG
+    YTDLP_BIN = YTDLP_BINARY.load()
+    assert YTDLP_BIN.abspath and YTDLP_BIN.version
+    timeout = timeout or YTDLP_CONFIG.YTDLP_TIMEOUT

     out_dir = out_dir or Path(link.link_dir)
     output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
     # later options take precedence
     options = [
-        *YOUTUBEDL_ARGS,
-        *YOUTUBEDL_EXTRA_ARGS,
-        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
+        *YTDLP_CONFIG.YTDLP_EXTRA_ARGS,
+        *([] if YTDLP_CONFIG.YTDLP_CHECK_SSL_VALIDITY else ['--no-check-certificate']),
         # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
     ]
     cmd = [
-        YOUTUBEDL_BINARY,
+        str(YTDLP_BIN.abspath),
         *dedupe(options),
         link.url,
     ]
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
     try:
-        result = run(cmd, cwd=str(output_path), timeout=timeout + 1)
+        result = run(cmd, cwd=str(output_path), timeout=timeout + 1, text=True)
         chmod_file(output, cwd=str(out_dir))
         if result.returncode:
-            if (b'ERROR: Unsupported URL' in result.stderr
-                or b'HTTP Error 404' in result.stderr
-                or b'HTTP Error 403' in result.stderr
-                or b'URL could be a direct video link' in result.stderr
-                or b'Unable to extract container ID' in result.stderr):
+            if ('ERROR: Unsupported URL' in result.stderr
+                or 'HTTP Error 404' in result.stderr
+                or 'HTTP Error 403' in result.stderr
+                or 'URL could be a direct video link' in result.stderr
+                or 'Unable to extract container ID' in result.stderr):
                 # These happen too frequently on non-media pages to warrant printing to console
                 pass
             else:
                 hints = (
-                    'Got youtube-dl (or yt-dlp) response code: {}.'.format(result.returncode),
-                    *result.stderr.decode().split('\n'),
+                    'Got yt-dlp response code: {}.'.format(result.returncode),
+                    *result.stderr.split('\n'),
                 )
                 raise ArchiveError('Failed to save media', hints)
     except Exception as err:

@@ -117,7 +113,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=YOUTUBEDL_VERSION,
+        cmd_version=str(YTDLP_BIN.version),
         output=output,
         status=status,
         index_texts=index_texts,

View file

@@ -8,13 +8,6 @@ from ..system import run, chmod_file
 from ..util import (
     enforce_types,
     is_static_file,
-    chrome_args,
-    chrome_cleanup,
-)
-from ..config import (
-    TIMEOUT,
-    SAVE_PDF,
-    CHROME_VERSION,
 )
 from ..logging_util import TimedProgress

@@ -25,6 +18,8 @@ def get_output_path():
 @enforce_types
 def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
+    from plugins_extractor.chrome.apps import CHROME_CONFIG

     if is_static_file(link.url):
         return False

@@ -32,34 +27,40 @@
     if not overwrite and (out_dir / get_output_path()).exists():
         return False

-    return SAVE_PDF
+    return CHROME_CONFIG.SAVE_PDF

 @enforce_types
-def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
     """print PDF of site to file using chrome --headless"""

+    from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
+    CHROME_BIN = CHROME_BINARY.load()
+    assert CHROME_BIN.abspath and CHROME_BIN.version

     out_dir = out_dir or Path(link.link_dir)
     output: ArchiveOutput = get_output_path()
     cmd = [
-        *chrome_args(),
+        str(CHROME_BIN.abspath),
+        *CHROME_CONFIG.chrome_args(),
         '--print-to-pdf',
         link.url,
     ]
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
     try:
-        result = run(cmd, cwd=str(out_dir), timeout=timeout)
+        result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True)
         if result.returncode:
-            hints = (result.stderr or result.stdout).decode()
+            hints = (result.stderr or result.stdout)
             raise ArchiveError('Failed to save PDF', hints)
         chmod_file(get_output_path(), cwd=str(out_dir))
     except Exception as err:
         status = 'failed'
         output = err
-        chrome_cleanup()
+        CHROME_BINARY.chrome_cleanup_lockfile()
     finally:
         timer.end()

@@ -67,7 +68,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=CHROME_VERSION,
+        cmd_version=str(CHROME_BINARY.version),
         output=output,
         status=status,
         **timer.stats,

View file

@@ -5,17 +5,7 @@ from typing import Optional
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..system import run, chmod_file
-from ..util import (
-    enforce_types,
-    is_static_file,
-    chrome_args,
-    chrome_cleanup,
-)
-from ..config import (
-    TIMEOUT,
-    SAVE_SCREENSHOT,
-    CHROME_VERSION,
-)
+from ..util import enforce_types, is_static_file
 from ..logging_util import TimedProgress

@@ -25,6 +15,8 @@ def get_output_path():
 @enforce_types
 def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
+    from plugins_extractor.chrome.apps import CHROME_CONFIG

     if is_static_file(link.url):
         return False

@@ -32,40 +24,45 @@
     if not overwrite and (out_dir / get_output_path()).exists():
         return False

-    return SAVE_SCREENSHOT
+    return CHROME_CONFIG.SAVE_SCREENSHOT

 @enforce_types
-def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
     """take screenshot of site using chrome --headless"""

+    from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
+    CHROME_BIN = CHROME_BINARY.load()
+    assert CHROME_BIN.abspath and CHROME_BIN.version

     out_dir = out_dir or Path(link.link_dir)
     output: ArchiveOutput = get_output_path()
     cmd = [
-        *chrome_args(),
+        str(CHROME_BIN.abspath),
+        *CHROME_CONFIG.chrome_args(),
         '--screenshot',
         link.url,
     ]
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
     try:
-        result = run(cmd, cwd=str(out_dir), timeout=timeout)
+        result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True)
         if result.returncode:
-            hints = (result.stderr or result.stdout).decode()
+            hints = (result.stderr or result.stdout)
             raise ArchiveError('Failed to save screenshot', hints)
         chmod_file(output, cwd=str(out_dir))
     except Exception as err:
         status = 'failed'
         output = err
-        chrome_cleanup()
+        CHROME_BINARY.chrome_cleanup_lockfile()
     finally:
         timer.end()

     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=CHROME_VERSION,
+        cmd_version=str(CHROME_BIN.version),
         output=output,
         status=status,
         **timer.stats,

View file

@@ -7,22 +7,7 @@ import json
 from ..index.schema import Link, ArchiveResult, ArchiveError
 from ..system import run, chmod_file
-from ..util import (
-    enforce_types,
-    is_static_file,
-    chrome_args,
-    dedupe,
-)
-from ..config import (
-    TIMEOUT,
-    SAVE_SINGLEFILE,
-    DEPENDENCIES,
-    SINGLEFILE_VERSION,
-    SINGLEFILE_ARGS,
-    SINGLEFILE_EXTRA_ARGS,
-    CHROME_BINARY,
-    COOKIES_FILE,
-)
+from ..util import enforce_types, is_static_file, dedupe
 from ..logging_util import TimedProgress

@@ -32,6 +17,8 @@ def get_output_path():
 @enforce_types
 def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
+    from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG

     if is_static_file(link.url):
         return False

@@ -39,30 +26,35 @@
     if not overwrite and (out_dir / get_output_path()).exists():
         return False

-    return SAVE_SINGLEFILE
+    return SINGLEFILE_CONFIG.SAVE_SINGLEFILE

 @enforce_types
-def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
     """download full site using single-file"""

+    from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
+    from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG, SINGLEFILE_BINARY
+    CHROME_BIN = CHROME_BINARY.load()
+    assert CHROME_BIN.abspath and CHROME_BIN.version
+    SINGLEFILE_BIN = SINGLEFILE_BINARY.load()
+    assert SINGLEFILE_BIN.abspath and SINGLEFILE_BIN.version

     out_dir = out_dir or Path(link.link_dir)
     output = get_output_path()

-    browser_args = chrome_args(CHROME_TIMEOUT=0)
+    browser_args = CHROME_CONFIG.chrome_args(CHROME_TIMEOUT=0)

     # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
-    browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
+    # later options take precedence
     options = [
-        '--browser-executable-path={}'.format(CHROME_BINARY),
-        *(["--browser-cookies-file={}".format(COOKIES_FILE)] if COOKIES_FILE else []),
-        browser_args,
-        *SINGLEFILE_ARGS,
-        *SINGLEFILE_EXTRA_ARGS,
+        '--browser-executable-path={}'.format(CHROME_BIN.abspath),
+        *(["--browser-cookies-file={}".format(SINGLEFILE_CONFIG.SINGLEFILE_COOKIES_FILE)] if SINGLEFILE_CONFIG.SINGLEFILE_COOKIES_FILE else []),
+        '--browser-args={}'.format(json.dumps(browser_args)),
+        *SINGLEFILE_CONFIG.SINGLEFILE_EXTRA_ARGS,
     ]
     cmd = [
-        DEPENDENCIES['SINGLEFILE_BINARY']['path'],
+        str(SINGLEFILE_BIN.abspath),
         *dedupe(options),
         link.url,
         output,

@@ -72,13 +64,13 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     timer = TimedProgress(timeout, prefix='      ')
     result = None
     try:
-        result = run(cmd, cwd=str(out_dir), timeout=timeout)
+        result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True, capture_output=True)

         # parse out number of files downloaded from last line of stderr:
         # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
         output_tail = [
             line.strip()
-            for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
+            for line in (result.stdout + result.stderr).rsplit('\n', 5)[-5:]
             if line.strip()
         ]
         hints = (

@@ -93,9 +85,9 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     except (Exception, OSError) as err:
         status = 'failed'
         # TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
-        cmd[2] = browser_args.replace('"', "\\\"")
+        cmd[2] = cmd[2].replace('"', "\\\"")
         if result:
-            err.hints = (result.stdout + result.stderr).decode().split('\n')
+            err.hints = (result.stdout + result.stderr).split('\n')
         output = err
     finally:
         timer.end()

@@ -103,7 +95,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=SINGLEFILE_VERSION,
+        cmd_version=str(SINGLEFILE_BIN.version),
         output=output,
         status=status,
         **timer.stats,

View file

@@ -1,10 +1,10 @@
 __package__ = 'archivebox'

 import os
-import time
 import sys
 import shutil
 import platform
+import archivebox

 from typing import Dict, List, Optional, Iterable, IO, Union
 from pathlib import Path

@@ -69,6 +69,7 @@ from .extractors import archive_links, archive_link, ignore_methods
 from .misc.logging import stderr, hint
 from .misc.checks import check_data_folder, check_dependencies
 from .config import (
+    setup_django_minimal,
     ConfigDict,
     ANSI,
     IS_TTY,

@@ -81,8 +82,6 @@ from .config import (
     TIMEZONE,
     ENFORCE_ATOMIC_WRITES,
     OUTPUT_PERMISSIONS,
-    PYTHON_BINARY,
-    ARCHIVEBOX_BINARY,
     ONLY_NEW,
     OUTPUT_DIR,
     SOURCES_DIR,

@@ -95,31 +94,22 @@ from .config import (
     HTML_INDEX_FILENAME,
     SQL_INDEX_FILENAME,
     ALLOWED_IN_OUTPUT_DIR,
-    SEARCH_BACKEND_ENGINE,
     LDAP,
-    get_version,
     write_config_file,
     VERSION,
-    VERSIONS_AVAILABLE,
-    CAN_UPGRADE,
     COMMIT_HASH,
     BUILD_TIME,
     CODE_LOCATIONS,
     DATA_LOCATIONS,
     DEPENDENCIES,
-    CHROME_BINARY,
-    CHROME_VERSION,
     YOUTUBEDL_BINARY,
     YOUTUBEDL_VERSION,
     SINGLEFILE_VERSION,
     READABILITY_VERSION,
     MERCURY_VERSION,
-    NODE_VERSION,
     load_all_config,
     CONFIG,
     USER_CONFIG,
-    ADMIN_USERNAME,
-    ADMIN_PASSWORD,
     get_real_name,
     setup_django,
 )
@@ -216,6 +206,11 @@ def version(quiet: bool=False,
             out_dir: Path=OUTPUT_DIR) -> None:
     """Print the ArchiveBox version and dependency information"""

+    setup_django_minimal()
+    from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SHELL_CONFIG
+    from plugins_auth.ldap.apps import LDAP_CONFIG
+    from django.conf import settings

     print(VERSION)
     if not quiet:

@@ -227,7 +222,7 @@
         p = platform.uname()
         print(
-            'ArchiveBox v{}'.format(get_version(CONFIG)),
+            'ArchiveBox v{}'.format(archivebox.__version__),
             f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
             f'BUILD_TIME={BUILD_TIME}',
         )

@@ -241,22 +236,22 @@
         )
         OUTPUT_IS_REMOTE_FS = DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
         print(
-            f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
+            f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
             f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
-            f'FS_USER={PUID}:{PGID}',
-            f'FS_PERMS={OUTPUT_PERMISSIONS}',
+            f'FS_USER={SHELL_CONFIG.PUID}:{SHELL_CONFIG.PGID}',
+            f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
         )
         print(
-            f'DEBUG={DEBUG}',
-            f'IS_TTY={IS_TTY}',
+            f'DEBUG={SHELL_CONFIG.DEBUG}',
+            f'IS_TTY={SHELL_CONFIG.IS_TTY}',
             f'TZ={TIMEZONE}',
-            f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
-            f'LDAP={LDAP}',
+            f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}',
+            f'LDAP={LDAP_CONFIG.LDAP_ENABLED}',
             #f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})',  # add this if we have more useful info to show eventually
         )
         print()

-        print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
+        print('{white}[i] Old dependency versions:{reset}'.format(**ANSI))
         for name, dependency in DEPENDENCIES.items():
             print(printable_dependency_version(name, dependency))

@@ -264,6 +259,12 @@
             if name == 'ARCHIVEBOX_BINARY':
                 print()

+        print()
+        print('{white}[i] New dependency versions:{reset}'.format(**ANSI))
+        for name, binary in settings.BINARIES.items():
+            loaded_bin = binary.load()
+            print('', '√' if loaded_bin.is_valid else 'X', '', loaded_bin.name.ljust(21), str(loaded_bin.version).ljust(15), loaded_bin.abspath)

         print()
         print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
         for name, path in CODE_LOCATIONS.items():

@@ -431,10 +432,11 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=
     print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))

     from django.contrib.auth.models import User
+    from plugins_sys.config.apps import SERVER_CONFIG

-    if (ADMIN_USERNAME and ADMIN_PASSWORD) and not User.objects.filter(username=ADMIN_USERNAME).exists():
+    if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(username=SERVER_CONFIG.ADMIN_USERNAME).exists():
         print('{green}[+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.{reset}'.format(**ANSI))
-        User.objects.create_superuser(username=ADMIN_USERNAME, password=ADMIN_PASSWORD)
+        User.objects.create_superuser(username=SERVER_CONFIG.ADMIN_USERNAME, password=SERVER_CONFIG.ADMIN_PASSWORD)

     if existing_index:
         print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))

@@ -693,8 +695,8 @@ def add(urls: Union[str, List[str]],
     # tail_worker_logs(worker['stdout_logfile'])

-    if CAN_UPGRADE:
-        hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
+    # if CAN_UPGRADE:
+    #     hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")

     return new_links
@@ -967,6 +969,8 @@
 def setup(out_dir: Path=OUTPUT_DIR) -> None:
     """Automatically install all ArchiveBox dependencies and extras"""

     if not (out_dir / ARCHIVE_DIR_NAME).exists():
         run_subcommand('init', stdin=None, pwd=out_dir)

@@ -980,24 +984,26 @@
     stderr('\n[+] Installing enabled ArchiveBox dependencies automatically...', color='green')

+    from plugins_pkg.pip.apps import PYTHON_BINARY

     stderr('\n    Installing YOUTUBEDL_BINARY automatically using pip...')
     if YOUTUBEDL_VERSION:
         print(f'{YOUTUBEDL_VERSION} is already installed', YOUTUBEDL_BINARY)
     else:
         try:
             run_shell([
-                PYTHON_BINARY, '-m', 'pip',
+                PYTHON_BINARY.load().abspath, '-m', 'pip',
                 'install',
                 '--upgrade',
                 '--no-cache-dir',
                 '--no-warn-script-location',
                 'yt-dlp',
-            ], capture_output=False, cwd=out_dir)
+            ], capture_output=False, cwd=out_dir, text=True)
             pkg_path = run_shell([
-                PYTHON_BINARY, '-m', 'pip',
+                PYTHON_BINARY.load().abspath, '-m', 'pip',
                 'show',
                 'yt-dlp',
-            ], capture_output=True, text=True, cwd=out_dir).stdout.decode().split('Location: ')[-1].split('\n', 1)[0]
+            ], capture_output=True, text=True, cwd=out_dir).stdout.split('Location: ')[-1].split('\n', 1)[0]
             NEW_YOUTUBEDL_BINARY = Path(pkg_path) / 'yt-dlp' / '__main__.py'
             os.chmod(NEW_YOUTUBEDL_BINARY, 0o777)
             assert NEW_YOUTUBEDL_BINARY.exists(), f'yt-dlp must exist inside {pkg_path}'

@@ -1006,33 +1012,18 @@
             stderr(f'[X] Failed to install python packages: {e}', color='red')
             raise SystemExit(1)

-    if platform.machine() == 'armv7l':
-        stderr('\n    Skip the automatic installation of CHROME_BINARY because playwright is not available on armv7.')
-    else:
-        stderr('\n    Installing CHROME_BINARY automatically using playwright...')
-        if CHROME_VERSION:
-            print(f'{CHROME_VERSION} is already installed', CHROME_BINARY)
-        else:
-            try:
-                run_shell([
-                    PYTHON_BINARY, '-m', 'pip',
-                    'install',
-                    '--upgrade',
-                    '--no-cache-dir',
-                    '--no-warn-script-location',
-                    'playwright',
-                ], capture_output=False, cwd=out_dir)
-                run_shell([PYTHON_BINARY, '-m', 'playwright', 'install', 'chromium'], capture_output=False, cwd=out_dir)
-                proc = run_shell([PYTHON_BINARY, '-c', 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)'], capture_output=True, text=True, cwd=out_dir)
-                NEW_CHROME_BINARY = proc.stdout.decode().strip() if isinstance(proc.stdout, bytes) else proc.stdout.strip()
-                assert NEW_CHROME_BINARY and len(NEW_CHROME_BINARY), 'CHROME_BINARY must contain a path'
-                config(f'CHROME_BINARY={NEW_CHROME_BINARY}', set=True, out_dir=out_dir)
-            except BaseException as e:  # lgtm [py/catch-base-exception]
-                stderr(f'[X] Failed to install chromium using playwright: {e.__class__.__name__} {e}', color='red')
-                raise SystemExit(1)
+    from plugins_extractor.chrome.apps import CHROME_BINARY
+
+    CHROME_BINARY.load_or_install()
+
+    from plugins_pkg.npm.apps import NPM_BINARY
+    from plugins_extractor.singlefile.apps import SINGLEFILE_BINARY
+
+    SINGLEFILE_BINARY.load_or_install()

     stderr('\n    Installing SINGLEFILE_BINARY, READABILITY_BINARY, MERCURY_BINARY automatically using npm...')
-    if not NODE_VERSION:
+    if not NPM_BINARY.load().version:
         stderr('[X] You must first install node & npm using your system package manager', color='red')
         hint([
             'https://github.com/nodesource/distributions#table-of-contents',

@@ -1077,7 +1068,9 @@
     stderr('\n[√] Set up ArchiveBox and its dependencies successfully.', color='green')

-    run_shell([PYTHON_BINARY, ARCHIVEBOX_BINARY, '--version'], capture_output=False, cwd=out_dir)
+    from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
+
+    run_shell([ARCHIVEBOX_BINARY.load().abspath, '--version'], capture_output=False, cwd=out_dir)
@@ -1192,6 +1185,8 @@ def schedule(add: bool=False,
     """Set ArchiveBox to regularly import URLs at specific times using cron"""

     check_data_folder(CONFIG)
+    setup_django_minimal()
+    from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY

     Path(LOGS_DIR).mkdir(exist_ok=True)

@@ -1212,7 +1207,7 @@
         'cd',
         quoted(out_dir),
         '&&',
-        quoted(ARCHIVEBOX_BINARY),
+        quoted(ARCHIVEBOX_BINARY.load().abspath),
         *([
             'add',
             *(['--overwrite'] if overwrite else []),

@@ -1300,8 +1295,8 @@
             print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
             raise SystemExit(1)

-    if CAN_UPGRADE:
-        hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
+    # if CAN_UPGRADE:
+    #     hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")

 @enforce_types

@@ -1386,6 +1381,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None:
     """Run an ArchiveBox Django management command"""

     check_data_folder(CONFIG)
+    setup_django_minimal()
     from django.core.management import execute_from_command_line

     if (args and "createsuperuser" in args) and (IN_DOCKER and not IS_TTY):

@@ -1393,7 +1389,9 @@
         stderr('    docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
         stderr('')

-    execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])
+    from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
+    execute_from_command_line([ARCHIVEBOX_BINARY.load().abspath, 'manage', *(args or ['help'])])

 @enforce_types

View file

@@ -1,5 +1,6 @@
 __package__ = 'archivebox.plugins_extractor.chrome'

+import sys
 import platform
 from pathlib import Path
 from typing import List, Optional, Dict, ClassVar

@@ -7,7 +8,8 @@ from typing import List, Optional, Dict, ClassVar
 from django.conf import settings

 # Depends on other PyPI/vendor packages:
-from pydantic import InstanceOf, Field
+from rich import print
+from pydantic import InstanceOf, Field, model_validator
 from pydantic_pkgr import (
     BinProvider,
     BinName,

@@ -25,9 +27,12 @@ from plugantic.base_binary import BaseBinary, env
 from plugantic.base_hook import BaseHook

 # Depends on Other Plugins:
+from plugins_sys.config.apps import ARCHIVING_CONFIG, SHELL_CONFIG
 from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
 from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER

+from ...util import dedupe

 CHROMIUM_BINARY_NAMES_LINUX = [
     "chromium",

@@ -82,11 +87,113 @@ def create_macos_app_symlink(target: Path, shortcut: Path):
 class ChromeConfig(BaseConfigSet):
     section: ClassVar[ConfigSectionName] = "DEPENDENCY_CONFIG"

-    CHROME_BINARY: str = Field(default='chrome')
-    CHROME_ARGS: List[str] | None = Field(default=None)
-    CHROME_EXTRA_ARGS: List[str] = Field(default=[])
-    CHROME_DEFAULT_ARGS: List[str] = Field(default=lambda: ['--timeout={TIMEOUT-10}'])
+    USE_CHROME: bool = Field(default=True)
+
+    # Chrome Binary
+    CHROME_BINARY: str = Field(default='chrome')
+    CHROME_EXTRA_ARGS: List[str] = Field(default=[])
+
+    # Chrome Options Tuning
+    CHROME_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT - 10)
+    CHROME_HEADLESS: bool = Field(default=True)
+    CHROME_SANDBOX: bool = Field(default=lambda: not SHELL_CONFIG.IN_DOCKER)
+    CHROME_RESOLUTION: str = Field(default=lambda: ARCHIVING_CONFIG.RESOLUTION)
+    CHROME_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
+
+    # Cookies & Auth
+    CHROME_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
+    CHROME_USER_DATA_DIR: Path | None = Field(default=None)
+    CHROME_PROFILE_NAME: str = Field(default='Default')
+
+    # Extractor Toggles
+    SAVE_SCREENSHOT: bool = Field(default=True, alias='FETCH_SCREENSHOT')
+    SAVE_DOM: bool = Field(default=True, alias='FETCH_DOM')
+    SAVE_PDF: bool = Field(default=True, alias='FETCH_PDF')
+
+    @model_validator(mode='after')
+    def validate_use_chrome(self):
+        if self.USE_CHROME and self.CHROME_TIMEOUT < 15:
+            print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]', file=sys.stderr)
+            print('    Chrome will fail to archive all sites if set to less than ~15 seconds.', file=sys.stderr)
+            print('    (Setting it to somewhere between 30 and 300 seconds is recommended)', file=sys.stderr)
+            print(file=sys.stderr)
+            print('    If you want to make ArchiveBox run faster, disable specific archive methods instead:', file=sys.stderr)
+            print('    https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles', file=sys.stderr)
+            print(file=sys.stderr)
+
+        # if user has specified a user data dir, make sure its valid
+        if self.CHROME_USER_DATA_DIR and self.CHROME_USER_DATA_DIR.exists():
+            # check to make sure user_data_dir/<profile_name> exists
+            if not (self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME).exists():
+                print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]', file=sys.stderr)
+                print(f'    {self.CHROME_USER_DATA_DIR}', file=sys.stderr)
+                print('    Make sure you set it to a Chrome user data directory containing a Default profile folder.', file=sys.stderr)
+                print('    For more info see:', file=sys.stderr)
+                print('    https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR', file=sys.stderr)
+                if '/Default' in str(self.CHROME_USER_DATA_DIR):
+                    print(file=sys.stderr)
+                    print('    Try removing /Default from the end e.g.:', file=sys.stderr)
+                    print('    CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]), file=sys.stderr)
+
+                # hard error is too annoying here, instead just set it to nothing
+                # raise SystemExit(2)
+                self.CHROME_USER_DATA_DIR = None
+        else:
+            self.CHROME_USER_DATA_DIR = None
+
+        return self
+
+    def chrome_args(self, **options) -> List[str]:
+        """helper to build up a chrome shell command with arguments"""
+
+        # Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/
+
+        options = self.model_copy(update=options)
+
+        cmd_args = [*options.CHROME_EXTRA_ARGS]
+
+        if options.CHROME_HEADLESS:
+            cmd_args += ["--headless=new"]  # expects chrome version >= 111
+
+        if not options.CHROME_SANDBOX:
+            # assume this means we are running inside a docker container
+            # in docker, GPU support is limited, sandboxing is unecessary,
+            # and SHM is limited to 64MB by default (which is too low to be usable).
+            cmd_args += (
+                "--no-sandbox",
+                "--no-zygote",
+                "--disable-dev-shm-usage",
+                "--disable-software-rasterizer",
+                "--run-all-compositor-stages-before-draw",
+                "--hide-scrollbars",
+                "--autoplay-policy=no-user-gesture-required",
+                "--no-first-run",
+                "--use-fake-ui-for-media-stream",
+                "--use-fake-device-for-media-stream",
+                "--disable-sync",
+                # "--password-store=basic",
+            )

+        # disable automatic updating when running headless, as there's no user to see the upgrade prompts
+        cmd_args += ("--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",)
+
+        # set window size for screenshot/pdf/etc. rendering
+        cmd_args += ('--window-size={}'.format(options.CHROME_RESOLUTION),)
+
+        if not options.CHROME_CHECK_SSL_VALIDITY:
+            cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
+
+        if options.CHROME_USER_AGENT:
+            cmd_args += ('--user-agent={}'.format(options.CHROME_USER_AGENT),)
+
+        if options.CHROME_TIMEOUT:
+            cmd_args += ('--timeout={}'.format(options.CHROME_TIMEOUT * 1000),)
+
+        if options.CHROME_USER_DATA_DIR:
+            cmd_args.append('--user-data-dir={}'.format(options.CHROME_USER_DATA_DIR))
+            cmd_args.append('--profile-directory={}'.format(options.CHROME_PROFILE_NAME))
+
+        return dedupe(cmd_args)

 CHROME_CONFIG = ChromeConfig()

@@ -122,6 +229,18 @@ class ChromeBinary(BaseBinary):
             # otherwise on linux we can symlink directly to binary executable
             symlink.symlink_to(binary.abspath)

+    @staticmethod
+    def chrome_cleanup_lockfile():
+        """
+        Cleans up any state or runtime files that chrome leaves behind when killed by
+        a timeout or other error
+        """
+        lock_file = Path("~/.config/chromium/SingletonLock")
+
+        if SHELL_CONFIG.IN_DOCKER and lock_file.exists():
+            lock_file.unlink()

 CHROME_BINARY = ChromeBinary()
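Since chrome_args() starts from options = self.model_copy(update=options), any ChromeConfig field can be overridden per call without mutating the global CHROME_CONFIG. singlefile.py above relies on this to suppress the --timeout flag (a falsy CHROME_TIMEOUT skips that branch); the second line below is a hypothetical override of the same kind:

    browser_args = CHROME_CONFIG.chrome_args(CHROME_TIMEOUT=0)             # no --timeout=... flag emitted
    tall_page_args = CHROME_CONFIG.chrome_args(CHROME_RESOLUTION='1440,2000')  # hypothetical per-call override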

View file

@@ -24,40 +24,21 @@ from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER

 ###################### Config ##########################

-class SinglefileToggleConfigs(BaseConfigSet):
-    section: ClassVar[ConfigSectionName] = 'ARCHIVE_METHOD_TOGGLES'
+class SinglefileConfig(BaseConfigSet):
+    section: ClassVar[ConfigSectionName] = 'ARCHIVING_CONFIG'

     SAVE_SINGLEFILE: bool = True

-class SinglefileOptionsConfigs(BaseConfigSet):
-    section: ClassVar[ConfigSectionName] = 'ARCHIVE_METHOD_OPTIONS'
-
     SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
     SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
     SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
     SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)

-class SinglefileDependencyConfigs(BaseConfigSet):
-    section: ClassVar[ConfigSectionName] = 'DEPENDENCY_CONFIG'
-
     SINGLEFILE_BINARY: str = Field(default='wget')
-    SINGLEFILE_ARGS: Optional[List[str]] = Field(default=None)
     SINGLEFILE_EXTRA_ARGS: List[str] = []
-    SINGLEFILE_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
-
-class SinglefileConfigs(SinglefileToggleConfigs, SinglefileOptionsConfigs, SinglefileDependencyConfigs):
-    # section: ClassVar[ConfigSectionName] = 'ALL_CONFIGS'
-    pass
-
-DEFAULT_GLOBAL_CONFIG = {
-    'CHECK_SSL_VALIDITY': False,
-    'SAVE_SINGLEFILE': True,
-    'TIMEOUT': 120,
-}
-
-SINGLEFILE_CONFIG = SinglefileConfigs(**DEFAULT_GLOBAL_CONFIG)
+SINGLEFILE_CONFIG = SinglefileConfig()

 SINGLEFILE_MIN_VERSION = '1.1.54'

View file

@@ -1,6 +1,8 @@
+import sys
+
 from typing import List, Dict, ClassVar
 from subprocess import run, PIPE

-from pydantic import InstanceOf, Field
+from pydantic import InstanceOf, Field, model_validator, AliasChoices

 from django.conf import settings

@@ -10,20 +12,37 @@ from plugantic.base_configset import BaseConfigSet, ConfigSectionName
 from plugantic.base_binary import BaseBinary, env, apt, brew
 from plugantic.base_hook import BaseHook

+from plugins_sys.config.apps import ARCHIVING_CONFIG
 from plugins_pkg.pip.apps import pip

 ###################### Config ##########################

-class YtdlpDependencyConfigs(BaseConfigSet):
+class YtdlpConfig(BaseConfigSet):
     section: ClassVar[ConfigSectionName] = "DEPENDENCY_CONFIG"

-    USE_YTDLP: bool = True
-    YTDLP_BINARY: str = Field(default='yt-dlp')
-
-DEFAULT_GLOBAL_CONFIG = {}
-YTDLP_CONFIG = YtdlpDependencyConfigs(**DEFAULT_GLOBAL_CONFIG)
+    USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))
+
+    YTDLP_BINARY: str = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY')
+    YTDLP_EXTRA_ARGS: List[str] = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS')
+
+    YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
+    YTDLP_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT)
+
+    @model_validator(mode='after')
+    def validate_use_ytdlp(self):
+        if self.USE_YTDLP and self.YTDLP_TIMEOUT < 20:
+            print(f'[red][!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={self.YTDLP_TIMEOUT} seconds)[/red]', file=sys.stderr)
+            print('    youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.', file=sys.stderr)
+            print('    (Setting it somewhere over 60 seconds is recommended)', file=sys.stderr)
+            print(file=sys.stderr)
+            print('    If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:', file=sys.stderr)
+            print('    https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr)
+            print(file=sys.stderr)
+        return self
+
+YTDLP_CONFIG = YtdlpConfig()

@@ -31,6 +50,9 @@ class YtdlpBinary(BaseBinary):
     name: BinName = YTDLP_CONFIG.YTDLP_BINARY
     binproviders_supported: List[InstanceOf[BinProvider]] = [pip, apt, brew, env]

+YTDLP_BINARY = YtdlpBinary()

 class FfmpegBinary(BaseBinary):
     name: BinName = 'ffmpeg'
     binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

@@ -53,10 +75,9 @@ class FfmpegBinary(BaseBinary):
     # def get_ffmpeg_version(self) -> Optional[str]:
     #     return self.exec(cmd=['-version']).stdout

-YTDLP_BINARY = YtdlpBinary()
 FFMPEG_BINARY = FfmpegBinary()

 # class YtdlpExtractor(BaseExtractor):
 #     name: str = 'ytdlp'
 #     binary: str = 'ytdlp'
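The AliasChoices/alias fields are what keep existing user configs working after the rename: values written as USE_YOUTUBEDL or SAVE_MEDIA still populate USE_YTDLP. A standalone sketch of that pydantic v2 behavior (illustrative class name, not part of the codebase):

    from pydantic import BaseModel, Field, AliasChoices

    class ExampleConfig(BaseModel):
        # new canonical name, still settable via the two legacy keys
        USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))

    assert ExampleConfig.model_validate({'USE_YOUTUBEDL': False}).USE_YTDLP is False
    assert ExampleConfig.model_validate({'SAVE_MEDIA': False}).USE_YTDLP is False
    assert ExampleConfig().USE_YTDLP is True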

View file

@@ -18,8 +18,6 @@ from requests.exceptions import RequestException, ReadTimeout
 from base32_crockford import encode as base32_encode  # type: ignore
 from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
-from os.path import lexists
-from os import remove as remove_file

 try:
     import chardet

@@ -282,82 +280,6 @@ def get_headers(url: str, timeout: int=None) -> str:
     )

-@enforce_types
-def chrome_args(**options) -> List[str]:
-    """helper to build up a chrome shell command with arguments"""
-
-    # Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/
-
-    from .config import (
-        CHROME_OPTIONS,
-        CHROME_VERSION,
-        CHROME_EXTRA_ARGS,
-    )
-
-    options = {**CHROME_OPTIONS, **options}
-
-    if not options['CHROME_BINARY']:
-        raise Exception('Could not find any CHROME_BINARY installed on your system')
-
-    cmd_args = [options['CHROME_BINARY']]
-
-    cmd_args += CHROME_EXTRA_ARGS
-
-    if options['CHROME_HEADLESS']:
-        cmd_args += ("--headless=new",)  # expects chrome version >= 111
-
-    if not options['CHROME_SANDBOX']:
-        # assume this means we are running inside a docker container
-        # in docker, GPU support is limited, sandboxing is unecessary,
-        # and SHM is limited to 64MB by default (which is too low to be usable).
-        cmd_args += (
-            "--no-sandbox",
-            "--no-zygote",
-            "--disable-dev-shm-usage",
-            "--disable-software-rasterizer",
-            "--run-all-compositor-stages-before-draw",
-            "--hide-scrollbars",
-            "--autoplay-policy=no-user-gesture-required",
-            "--no-first-run",
-            "--use-fake-ui-for-media-stream",
-            "--use-fake-device-for-media-stream",
-            "--disable-sync",
-            # "--password-store=basic",
-        )
-
-    # disable automatic updating when running headless, as there's no user to see the upgrade prompts
-    cmd_args += ("--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",)
-
-    # set window size for screenshot/pdf/etc. rendering
-    cmd_args += ('--window-size={}'.format(options['RESOLUTION']),)
-
-    if not options['CHECK_SSL_VALIDITY']:
-        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
-
-    if options['CHROME_USER_AGENT']:
-        cmd_args += ('--user-agent={}'.format(options['CHROME_USER_AGENT']),)
-
-    if options['CHROME_TIMEOUT']:
-        cmd_args += ('--timeout={}'.format(options['CHROME_TIMEOUT'] * 1000),)
-
-    if options['CHROME_USER_DATA_DIR']:
-        cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
-        cmd_args.append('--profile-directory=Default')
-
-    return dedupe(cmd_args)
-
-def chrome_cleanup():
-    """
-    Cleans up any state or runtime files that chrome leaves behind when killed by
-    a timeout or other error
-    """
-    from .config import IN_DOCKER
-
-    if IN_DOCKER and lexists("/home/archivebox/.config/chromium/SingletonLock"):
-        remove_file("/home/archivebox/.config/chromium/SingletonLock")

 @enforce_types
 def ansi_to_html(text: str) -> str:
     """