yt-dlp flag cleanup

This commit is contained in:
Joseph Turian 2022-09-14 06:29:57 +02:00
commit 30947aeb07
12 changed files with 48 additions and 20 deletions

View file

@ -146,10 +146,12 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'--all-subs',
# There are too many of these and youtube
# throttles you with HTTP error 429
#'--write-auto-sub',
#'--write-auto-subs',
'--convert-subs=srt',
'--yes-playlist',
'--continue',
# This flag doesn't exist in youtube-dl;
# it is only available in yt-dlp.
'--no-abort-on-error',
# --ignore-errors must come AFTER
# --no-abort-on-error
@ -208,7 +210,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')},
'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
#'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},
'NODE_BINARY': {'type': str, 'default': 'node'},
'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
'CHROME_BINARY': {'type': str, 'default': None},

View file

@ -1,6 +1,7 @@
__package__ = 'archivebox.extractors'
import os
import sys
from pathlib import Path
from typing import Optional, List, Iterable, Union
@ -127,7 +128,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
else:
# print('{black} X {}{reset}'.format(method_name, **ANSI))
stats['skipped'] += 1
except Exception as e:
except Exception:
# Disabled until https://github.com/ArchiveBox/ArchiveBox/issues/984
# and https://github.com/ArchiveBox/ArchiveBox/issues/1014
# are fixed.
@ -137,14 +138,16 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
link.url,
)) from e
"""
# Instead, use the kludgy workaround from
# Instead, use the kludgy workaround from
# https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627
with open(ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={}))'.format(
f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}; ts={}'.format(
method_name,
link.url,
command,
ts
) + "\n"))
#f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")

View file

@ -33,7 +33,7 @@ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optio
@enforce_types
def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
"""Download playlists or individual video, audio, and subtitles using youtube-dl"""
"""Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""
out_dir = out_dir or Path(link.link_dir)
output: ArchiveOutput = 'media'
@ -61,7 +61,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
pass
else:
hints = (
'Got youtube-dl response code: {}.'.format(result.returncode),
'Got youtube-dl (or yt-dlp) response code: {}.'.format(result.returncode),
*result.stderr.decode().split('\n'),
)
raise ArchiveError('Failed to save media', hints)
@ -72,8 +72,18 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
timer.end()
# add video description and subtitles to full-text index
# Let's try a few different `errors` modes when decoding these files:
index_texts = [
text_file.read_text(encoding='utf-8').strip()
# errors:
# * 'strict' to raise a ValueError exception if there is an
# encoding error. The default value of None has the same effect.
# * 'ignore' ignores errors. Note that ignoring encoding errors
# can lead to data loss.
# * 'xmlcharrefreplace' is only supported when writing to a
# file. Characters not supported by the encoding are replaced with
# the appropriate XML character reference &#nnn;.
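# There are a few more options described in https://docs.python.org/3/library/functions.html#open
# NOTE(review): read_text() *decodes* bytes, and 'xmlcharrefreplace' is an
# encode-only handler, so it will likely raise instead of recovering when a
# file contains invalid UTF-8. 'replace' may be what is intended -- confirm.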
text_file.read_text(encoding='utf-8', errors='xmlcharrefreplace').strip()
for text_file in (
*output_path.glob('*.description'),
*output_path.glob('*.srt'),