mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-31 23:08:25 -04:00
yt-dlp flag cleanup
This commit is contained in:
commit
30947aeb07
12 changed files with 48 additions and 20 deletions
|
@ -146,10 +146,12 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
|||
'--all-subs',
|
||||
# There are too many of these and youtube
|
||||
# throttles you with HTTP error 429
|
||||
#'--write-auto-sub',
|
||||
#'--write-auto-subs',
|
||||
'--convert-subs=srt',
|
||||
'--yes-playlist',
|
||||
'--continue',
|
||||
# This flag doesn't exist in youtube-dl
|
||||
# only in yt-dlp
|
||||
'--no-abort-on-error',
|
||||
# --ignore-errors must come AFTER
|
||||
# --no-abort-on-error
|
||||
|
@ -208,7 +210,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
|||
'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
|
||||
'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
|
||||
'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')},
|
||||
'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
|
||||
#'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
|
||||
'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},
|
||||
'NODE_BINARY': {'type': str, 'default': 'node'},
|
||||
'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
|
||||
'CHROME_BINARY': {'type': str, 'default': None},
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from typing import Optional, List, Iterable, Union
|
||||
|
@ -127,7 +128,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
|
|||
else:
|
||||
# print('{black} X {}{reset}'.format(method_name, **ANSI))
|
||||
stats['skipped'] += 1
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
# Disabled until https://github.com/ArchiveBox/ArchiveBox/issues/984
|
||||
# and https://github.com/ArchiveBox/ArchiveBox/issues/1014
|
||||
# are fixed.
|
||||
|
@ -137,14 +138,16 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
|
|||
link.url,
|
||||
)) from e
|
||||
"""
|
||||
# Instead, use the kludgy workaround from
|
||||
# Instead, use the kludgy workaround from
|
||||
# https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627
|
||||
with open(ERROR_LOG, "a", encoding='utf-8') as f:
|
||||
command = ' '.join(sys.argv)
|
||||
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
|
||||
f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={}))'.format(
|
||||
f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}; ts={}'.format(
|
||||
method_name,
|
||||
link.url,
|
||||
command,
|
||||
ts
|
||||
) + "\n"))
|
||||
#f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optio
|
|||
|
||||
@enforce_types
|
||||
def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
|
||||
"""Download playlists or individual video, audio, and subtitles using youtube-dl"""
|
||||
"""Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""
|
||||
|
||||
out_dir = out_dir or Path(link.link_dir)
|
||||
output: ArchiveOutput = 'media'
|
||||
|
@ -61,7 +61,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
|
|||
pass
|
||||
else:
|
||||
hints = (
|
||||
'Got youtube-dl response code: {}.'.format(result.returncode),
|
||||
'Got youtube-dl (or yt-dlp) response code: {}.'.format(result.returncode),
|
||||
*result.stderr.decode().split('\n'),
|
||||
)
|
||||
raise ArchiveError('Failed to save media', hints)
|
||||
|
@ -72,8 +72,18 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
|
|||
timer.end()
|
||||
|
||||
# add video description and subtitles to full-text index
|
||||
# Let's try a few different error-handling modes when decoding these text files
|
||||
index_texts = [
|
||||
text_file.read_text(encoding='utf-8').strip()
|
||||
# errors:
|
||||
# * 'strict' to raise a ValueError exception if there is an
|
||||
# encoding error. The default value of None has the same effect.
|
||||
# * 'ignore' ignores errors. Note that ignoring encoding errors
|
||||
# can lead to data loss.
|
||||
# * 'xmlcharrefreplace' is only supported when writing to a
|
||||
# file. Characters not supported by the encoding are replaced with
|
||||
# the appropriate XML character reference &#nnn;.
|
||||
# There are a few more options described in https://docs.python.org/3/library/functions.html#open
|
||||
text_file.read_text(encoding='utf-8', errors='xmlcharrefreplace').strip()
|
||||
for text_file in (
|
||||
*output_path.glob('*.description'),
|
||||
*output_path.glob('*.srt'),
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue