yt-dlp flag cleanup

2025-05-31 23:08:25 -04:00 · 2022-09-14 06:29:57 +02:00 · 2022-09-14 06:29:57 +02:00 · 30947aeb07
commit 30947aeb07
parent f729bbe122 caa8b782fb
12 changed files with 48 additions and 20 deletions
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -146,10 +146,12 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
                                                                '--all-subs',
                                                                # There are too many of these and youtube
                                                                # throttles you with HTTP error 429
-                                                                #'--write-auto-sub',
+                                                                #'--write-auto-subs',
                                                                '--convert-subs=srt',
                                                                '--yes-playlist',
                                                                '--continue',
+                                                                # This flag doesn't exist in youtube-dl
+                                                                # only in yt-dlp
                                                                '--no-abort-on-error',
                                                                # --ignore-errors must come AFTER
                                                                # --no-abort-on-error
@@ -208,7 +210,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
        'SINGLEFILE_BINARY':        {'type': str,   'default': lambda c: bin_path('single-file')},
        'READABILITY_BINARY':       {'type': str,   'default': lambda c: bin_path('readability-extractor')},
        'MERCURY_BINARY':           {'type': str,   'default': lambda c: bin_path('mercury-parser')},
-        'YOUTUBEDL_BINARY':         {'type': str,   'default': 'youtube-dl'},
+        #'YOUTUBEDL_BINARY':         {'type': str,   'default': 'youtube-dl'},
+        'YOUTUBEDL_BINARY':         {'type': str,   'default': 'yt-dlp'},
        'NODE_BINARY':              {'type': str,   'default': 'node'},
        'RIPGREP_BINARY':           {'type': str,   'default': 'rg'},
        'CHROME_BINARY':            {'type': str,   'default': None},
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -1,6 +1,7 @@
 __package__ = 'archivebox.extractors'

 import os
+import sys
 from pathlib import Path

 from typing import Optional, List, Iterable, Union
@@ -127,7 +128,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                else:
                    # print('{black}      X {}{reset}'.format(method_name, **ANSI))
                    stats['skipped'] += 1
-            except Exception as e:
+            except Exception:
                # Disabled until https://github.com/ArchiveBox/ArchiveBox/issues/984
                # and https://github.com/ArchiveBox/ArchiveBox/issues/1014
                # are fixed.
@@ -137,14 +138,16 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                    link.url,
                )) from e
                """
-		        # Instead, use the kludgy workaround from
+                # Instead, use the kludgy workaround from
                # https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627
                with open(ERROR_LOG, "a", encoding='utf-8') as f:
                    command = ' '.join(sys.argv)
                    ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
-                    f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={}))'.format(
+                    f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}; ts={}'.format(
                        method_name,
                        link.url,
+                        command,
+                        ts
                    ) + "\n"))
                    #f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")

--- a/archivebox/extractors/media.py
+++ b/archivebox/extractors/media.py
@@ -33,7 +33,7 @@ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optio

@enforce_types
 def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
-    """Download playlists or individual video, audio, and subtitles using youtube-dl"""
+    """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""

    out_dir = out_dir or Path(link.link_dir)
    output: ArchiveOutput = 'media'
@@ -61,7 +61,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
                pass
            else:
                hints = (
-                    'Got youtube-dl response code: {}.'.format(result.returncode),
+                    'Got youtube-dl (or yt-dlp) response code: {}.'.format(result.returncode),
                    *result.stderr.decode().split('\n'),
                )
                raise ArchiveError('Failed to save media', hints)
@@ -72,8 +72,18 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
        timer.end()

    # add video description and subtitles to full-text index
+    # Decode each file as UTF-8; the `errors` options considered are listed below.
    index_texts = [
-        text_file.read_text(encoding='utf-8').strip()
+        # errors:
+        # * 'strict' to raise a ValueError exception if there is an
+        #   encoding error. The default value of None has the same effect.
+        # * 'ignore' ignores errors. Note that ignoring encoding errors
+        #   can lead to data loss.
+        # * 'xmlcharrefreplace' is only supported when writing to a file
+        #   (unsupported characters become XML character references &#nnn;).
+        #   NOTE(review): read_text() below decodes, so this handler likely raises on bad bytes; consider 'replace'.
+        # There are a few more options described in https://docs.python.org/3/library/functions.html#open
+        text_file.read_text(encoding='utf-8', errors='xmlcharrefreplace').strip()
        for text_file in (
            *output_path.glob('*.description'),
            *output_path.glob('*.srt'),