Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-06-02 15:49:51 -04:00)
move abx plugins inside vendor dir
Some checks are pending
Build Debian package / build (push) Waiting to run
Build Docker image / buildx (push) Waiting to run
Build Homebrew package / build (push) Waiting to run
Run linters / lint (push) Waiting to run
Build Pip package / build (push) Waiting to run
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Waiting to run
Run tests / docker_tests (push) Waiting to run
parent 5d9a32c364
commit b3c1cb716e
242 changed files with 2153 additions and 2700 deletions
archivebox/vendor/abx-plugin-wget/README.md (new vendored file, 0 lines)
archivebox/vendor/abx-plugin-wget/abx_plugin_wget/__init__.py (new vendored file, 35 lines)

@@ -0,0 +1,35 @@
__package__ = 'abx_plugin_wget'
__label__ = 'WGET'

import abx


@abx.hookimpl
def get_CONFIG():
    from .config import WGET_CONFIG

    return {
        'WGET_CONFIG': WGET_CONFIG
    }


@abx.hookimpl
def get_BINARIES():
    from .binaries import WGET_BINARY

    return {
        'wget': WGET_BINARY,
    }


@abx.hookimpl
def get_EXTRACTORS():
    from .extractors import WGET_EXTRACTOR, WARC_EXTRACTOR

    return {
        'wget': WGET_EXTRACTOR,
        'warc': WARC_EXTRACTOR,
    }


@abx.hookimpl
def ready():
    from .config import WGET_CONFIG
    WGET_CONFIG.validate()
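The hook functions above follow the pluggy-style plugin pattern: the host declares hookspecs with the same names and gathers each plugin's return value through a plugin manager. The abx manager itself is not part of this diff, so the sketch below approximates the call flow with plain pluggy; the AbxSpec class, the 'abx' project name, and the manager wiring are assumptions, not ArchiveBox code.

import pluggy

hookspec = pluggy.HookspecMarker('abx')   # assumes abx's markers use the 'abx' project name

class AbxSpec:
    @hookspec
    def get_CONFIG(self):
        """Each plugin returns a mapping of config-set names to config instances."""

    @hookspec
    def get_BINARIES(self):
        """Each plugin returns a mapping of binary names to Binary instances."""

    @hookspec
    def ready(self):
        """Called once everything is loaded, so plugins can validate their config."""

pm = pluggy.PluginManager('abx')
pm.add_hookspecs(AbxSpec)

import abx_plugin_wget
pm.register(abx_plugin_wget)                  # picks up the @abx.hookimpl functions above

all_configs = {}
for plugin_configs in pm.hook.get_CONFIG():   # one dict per registered plugin
    all_configs.update(plugin_configs)

pm.hook.ready()                               # triggers WGET_CONFIG.validate()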
archivebox/vendor/abx-plugin-wget/abx_plugin_wget/binaries.py (new vendored file, 18 lines)

@@ -0,0 +1,18 @@
__package__ = 'abx_plugin_wget'

from typing import List

from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName, Binary

from abx_plugin_default_binproviders import apt, brew, env

from .config import WGET_CONFIG


class WgetBinary(Binary):
    name: BinName = WGET_CONFIG.WGET_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]


WGET_BINARY = WgetBinary()
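The binproviders_supported list tells pydantic_pkgr where to look for the wget executable (apt, then brew, then the environment PATH). A minimal usage sketch; the load()/abspath/version calls mirror how save_wget() in wget.py below consumes the binary, while the printed values are only illustrative:

from abx_plugin_wget.binaries import WGET_BINARY

wget = WGET_BINARY.load()             # resolve via apt, brew, or $PATH
assert wget.abspath and wget.version  # same sanity check save_wget() performs
print(wget.abspath, wget.version)     # e.g. /usr/bin/wget 1.21.x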
archivebox/vendor/abx-plugin-wget/abx_plugin_wget/config.py (new vendored file, 69 lines)

@@ -0,0 +1,69 @@
import subprocess
from typing import List, Optional
from pathlib import Path

from pydantic import Field

from abx_spec_config.base_configset import BaseConfigSet

from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
from archivebox.misc.logging import STDERR


class WgetConfig(BaseConfigSet):

    SAVE_WGET: bool = True
    SAVE_WARC: bool = True

    USE_WGET: bool = Field(default=lambda c: c.SAVE_WGET or c.SAVE_WARC)

    WGET_BINARY: str = Field(default='wget')
    WGET_ARGS: List[str] = [
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
    ]
    WGET_EXTRA_ARGS: List[str] = []

    SAVE_WGET_REQUISITES: bool = Field(default=True)
    WGET_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)

    WGET_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    WGET_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    WGET_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    WGET_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)

    def validate(self):
        if self.USE_WGET and self.WGET_TIMEOUT < 10:
            STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.WGET_TIMEOUT} seconds)[/red]')
            STDERR.print('    wget will fail to archive any sites if set to less than ~20 seconds.')
            STDERR.print('    (Setting it somewhere over 60 seconds is recommended)')
            STDERR.print()
            STDERR.print('    If you want to disable wget archiving entirely, set SAVE_WGET=False and SAVE_WARC=False instead:')
            STDERR.print('    https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_wget')
            STDERR.print()
        return self

    @property
    def WGET_AUTO_COMPRESSION(self) -> bool:
        if hasattr(self, '_WGET_AUTO_COMPRESSION'):
            return self._WGET_AUTO_COMPRESSION
        try:
            cmd = [
                self.WGET_BINARY,
                "--compression=auto",
                "--help",
            ]
            self._WGET_AUTO_COMPRESSION = not subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=3).returncode
            return self._WGET_AUTO_COMPRESSION
        except (FileNotFoundError, OSError):
            self._WGET_AUTO_COMPRESSION = False
            return False


WGET_CONFIG = WgetConfig()
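WGET_AUTO_COMPRESSION is a cached feature probe: not every wget build supports --compression=auto, so rather than parse version strings the config runs `wget --compression=auto --help` and keys off the exit code. The same pattern works standalone; this sketch assumes only that an unsupported flag makes wget exit non-zero:

import shutil
import subprocess

def wget_supports_auto_compression(wget_bin: str = 'wget') -> bool:
    """Feature-detect --compression=auto by exit code instead of version parsing."""
    if not shutil.which(wget_bin):
        return False
    try:
        proc = subprocess.run(
            [wget_bin, '--compression=auto', '--help'],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=3,
        )
        return proc.returncode == 0
    except (OSError, subprocess.TimeoutExpired):
        return False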
archivebox/vendor/abx-plugin-wget/abx_plugin_wget/extractors.py (new vendored file, 35 lines)

@@ -0,0 +1,35 @@
__package__ = 'abx_plugin_wget'

# from pathlib import Path

# from pydantic_pkgr import BinName

# from .binaries import WGET_BINARY
# from .wget_util import wget_output_path

# class WgetExtractor(BaseExtractor):
#     name: ExtractorName = 'wget'
#     binary: BinName = WGET_BINARY.name

#     def get_output_path(self, snapshot) -> Path | None:
#         wget_index_path = wget_output_path(snapshot.as_link())
#         if wget_index_path:
#             return Path(wget_index_path)
#         return None

# WGET_EXTRACTOR = WgetExtractor()


# class WarcExtractor(BaseExtractor):
#     name: ExtractorName = 'warc'
#     binary: BinName = WGET_BINARY.name

#     def get_output_path(self, snapshot) -> Path | None:
#         warc_files = list((Path(snapshot.link_dir) / 'warc').glob('*.warc.gz'))
#         if warc_files:
#             return sorted(warc_files, key=lambda x: x.stat().st_size, reverse=True)[0]
#         return None


# WARC_EXTRACTOR = WarcExtractor()
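Everything in this module except __package__ is commented out, so the WGET_EXTRACTOR and WARC_EXTRACTOR names that get_EXTRACTORS() in __init__.py imports do not exist yet. A hedged sketch of what the uncommented WgetExtractor might look like, assuming a hypothetical abx_spec_extractor module provides the BaseExtractor and ExtractorName types referenced above:

from pathlib import Path

from pydantic_pkgr import BinName
from abx_spec_extractor import BaseExtractor, ExtractorName  # hypothetical import, not in this commit

from .binaries import WGET_BINARY
from .wget_util import wget_output_path


class WgetExtractor(BaseExtractor):
    name: ExtractorName = 'wget'
    binary: BinName = WGET_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        wget_index_path = wget_output_path(snapshot.as_link())
        return Path(wget_index_path) if wget_index_path else None


WGET_EXTRACTOR = WgetExtractor()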
archivebox/vendor/abx-plugin-wget/abx_plugin_wget/wget.py (new vendored file, 290 lines)

@@ -0,0 +1,290 @@
__package__ = 'abx_plugin_wget'

import re
import os
from pathlib import Path

from typing import Optional
from datetime import datetime, timezone

from archivebox.misc.system import run, chmod_file
from archivebox.misc.util import (
    enforce_types,
    without_fragment,
    without_query,
    path,
    domain,
    urldecode,
    dedupe,
)
from .config import WGET_CONFIG
from .binaries import WGET_BINARY

from archivebox.logging_util import TimedProgress
from archivebox.index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError


def get_output_path():
    # TODO: actually save output into this folder, instead of {domain}/**/index.html
    return 'wget/'

def get_embed_path(archiveresult=None):
    if not archiveresult:
        return get_output_path()

    link = archiveresult.snapshot.as_link()
    return wget_output_path(link)


@enforce_types
def should_save_wget(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
    output_path = wget_output_path(link)
    out_dir = out_dir or Path(link.link_dir)
    if not overwrite and output_path and (out_dir / output_path).exists():
        return False

    return WGET_CONFIG.SAVE_WGET


@enforce_types
def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=WGET_CONFIG.WGET_TIMEOUT) -> ArchiveResult:
    """download full site using wget"""

    out_dir = Path(out_dir or link.link_dir)
    assert out_dir.exists()

    if WGET_CONFIG.SAVE_WARC:
        warc_dir = out_dir / "warc"
        warc_dir.mkdir(exist_ok=True)
        warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))

    wget_binary = WGET_BINARY.load()
    assert wget_binary.abspath and wget_binary.version

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    output: ArchiveOutput = None
    # later options take precedence
    options = [
        *WGET_CONFIG.WGET_ARGS,
        *WGET_CONFIG.WGET_EXTRA_ARGS,
        '--timeout={}'.format(timeout),
        *(['--restrict-file-names={}'.format(WGET_CONFIG.WGET_RESTRICT_FILE_NAMES)] if WGET_CONFIG.WGET_RESTRICT_FILE_NAMES else []),
        *(['--warc-file={}'.format(str(warc_path))] if WGET_CONFIG.SAVE_WARC else []),
        *(['--page-requisites'] if WGET_CONFIG.SAVE_WGET_REQUISITES else []),
        *(['--user-agent={}'.format(WGET_CONFIG.WGET_USER_AGENT)] if WGET_CONFIG.WGET_USER_AGENT else []),
        *(['--load-cookies', str(WGET_CONFIG.WGET_COOKIES_FILE)] if WGET_CONFIG.WGET_COOKIES_FILE else []),
        *(['--compression=auto'] if WGET_CONFIG.WGET_AUTO_COMPRESSION else []),
        *([] if WGET_CONFIG.SAVE_WARC else ['--timestamping']),
        *([] if WGET_CONFIG.WGET_CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
        # '--server-response',  # print headers for better error parsing
    ]
    cmd = [
        str(wget_binary.abspath),
        *dedupe(options),
        link.url,
    ]

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='      ')
    try:
        result = run(cmd, cwd=str(out_dir), timeout=timeout)
        output = wget_output_path(link)

        # parse out number of files downloaded from last line of stderr:
        #  "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        output_tail = [
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
            if line.strip()
        ]
        files_downloaded = (
            int(output_tail[-1].strip().split(' ', 2)[1] or 0)
            if 'Downloaded:' in output_tail[-1]
            else 0
        )
        hints = (
            'Got wget response code: {}.'.format(result.returncode),
            *output_tail,
        )

        # Check for common failure cases
        if (result.returncode > 0 and files_downloaded < 1) or output is None:
            if b'403: Forbidden' in result.stderr:
                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
            if b'404: Not Found' in result.stderr:
                raise ArchiveError('404 Not Found', hints)
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise ArchiveError('500 Internal Server Error', hints)
            raise ArchiveError('Wget failed or got an error from the server', hints)

        if (out_dir / output).exists():
            chmod_file(output, cwd=str(out_dir))
        else:
            print(f'        {out_dir}/{output}')
            raise ArchiveError('Failed to find wget output after running', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=str(out_dir),
        cmd_version=str(wget_binary.version),
        output=output,
        status=status,
        **timer.stats,
    )


@enforce_types
def unsafe_wget_output_path(link: Link) -> Optional[str]:
    # There used to be a bunch of complex reverse-engineering path mapping logic here,
    # but it was removed in favor of just walking through the output folder recursively to try to find the
    # html file that wget produced. It's *much much much* slower than deriving it statically, and is currently
    # one of the main bottlenecks of ArchiveBox's performance (the output data is often on a slow HDD or network mount).
    # But it's STILL better than trying to figure out URL -> html filepath mappings ourselves from first principles.
    full_path = without_fragment(without_query(path(link.url))).strip('/')
    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
    for _ in range(4):
        try:
            if os.access(search_dir, os.R_OK) and search_dir.is_dir():
                html_files = [
                    f for f in search_dir.iterdir()
                    if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
                ]
                if html_files:
                    return str(html_files[0].relative_to(link.link_dir))

                # sometimes wget'd URLs have no ext and return non-html
                # e.g. /some/example/rss/all -> some RSS XML content)
                #      /some/other/url.o4g   -> some binary unrecognized ext)
                # test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
                last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
                for file_present in search_dir.iterdir():
                    if file_present == last_part_of_url:
                        return str((search_dir / file_present).relative_to(link.link_dir))
        except OSError:
            # OSError 36 and others can happen here, caused by trying to check for impossible paths
            # (paths derived from URLs can often contain illegal unicode characters or be too long,
            # causing the OS / filesystem to reject trying to open them with a system-level error)
            pass

        # Move up one directory level
        search_dir = search_dir.parent

        if str(search_dir) == link.link_dir:
            break

    # check for literally any file present that isnt an empty folder
    domain_dir = Path(domain(link.url).replace(":", "+"))
    files_within = [path for path in (Path(link.link_dir) / domain_dir).glob('**/*.*') if not str(path).endswith('.orig')]
    if files_within:
        return str((domain_dir / files_within[-1]).relative_to(link.link_dir))

    # abandon all hope, wget either never downloaded, or it produced an output path so horribly mutilated
    # that it's better we just pretend it doesnt exist
    # this is why ArchiveBox specializes in REDUNDANTLY saving copies of sites with multiple different tools
    return None


@enforce_types
def wget_output_path(link: Link, nocache: bool=False) -> Optional[str]:
    """calculate the path to the wgetted .html file, since wget may
    adjust some paths to be different than the base_url path.

    See docs on: wget --adjust-extension (-E), --restrict-file-names=windows|unix|ascii, --convert-links

    WARNING: this function is extremely error prone because mapping URLs to filesystem paths deterministically
    is basically impossible. Every OS and filesystem has different requirements on what special characters are
    allowed, and URLs are *full* of all kinds of special characters, illegal unicode, and generally unsafe strings
    that you dont want anywhere near your filesystem. Also URLs can be obscenely long, but most filesystems dont
    accept paths longer than 250 characters. On top of all that, this function only exists to try to reverse engineer
    wget's approach to solving this problem, so this is a shittier, less tested version of their already insanely
    complicated attempt to do this. Here be dragons:
    - https://github.com/ArchiveBox/ArchiveBox/issues/549
    - https://github.com/ArchiveBox/ArchiveBox/issues/1373
    - https://stackoverflow.com/questions/9532499/check-whether-a-path-is-valid-in-python-without-creating-a-file-at-the-paths-ta
    - and probably many more that I didn't realize were caused by this...

    The only constructive thing we could possibly do to this function is to figure out how to remove it.

    Preach loudly to anyone who will listen: never attempt to map URLs to filesystem paths,
    and pray you never have to deal with the aftermath of someone else's attempt to do so...
    """

    # Wget downloads can save in a number of different ways depending on the url:
    #    https://example.com
    #       > example.com/index.html
    #    https://example.com?v=zzVa_tX1OiI
    #       > example.com/index.html@v=zzVa_tX1OiI.html
    #    https://www.example.com/?v=zzVa_tX1OiI
    #       > example.com/index.html@v=zzVa_tX1OiI.html

    #    https://example.com/abc
    #       > example.com/abc.html
    #    https://example.com/abc/
    #       > example.com/abc/index.html
    #    https://example.com/abc?v=zzVa_tX1OiI.html
    #       > example.com/abc@v=zzVa_tX1OiI.html
    #    https://example.com/abc/?v=zzVa_tX1OiI.html
    #       > example.com/abc/index.html@v=zzVa_tX1OiI.html

    #    https://example.com/abc/test.html
    #       > example.com/abc/test.html
    #    https://example.com/abc/test?v=zzVa_tX1OiI
    #       > example.com/abc/test@v=zzVa_tX1OiI.html
    #    https://example.com/abc/test/?v=zzVa_tX1OiI
    #       > example.com/abc/test/index.html@v=zzVa_tX1OiI.html

    cache_key = f'{link.url_hash}:{link.timestamp}-{link.downloaded_at and link.downloaded_at.timestamp()}-wget-output-path'

    if not nocache:
        from django.core.cache import cache
        cached_result = cache.get(cache_key)
        if cached_result:
            return cached_result

    # There's also lots of complexity around how the urlencoding and renaming
    # is done for pages with query and hash fragments, extensions like shtml / htm / php / etc,
    # unicode escape sequences, punycode domain names, unicode double-width characters, extensions longer than
    # 4 characters, paths with multiple extensions, etc. the list goes on...

    output_path = None
    try:
        output_path = unsafe_wget_output_path(link)
    except Exception as err:
        pass  # better to pretend it just failed to download than expose gnarly OSErrors to users

    # check for unprintable unicode characters
    # https://github.com/ArchiveBox/ArchiveBox/issues/1373
    if output_path:
        safe_path = output_path.encode('utf-8', 'replace').decode()
        if output_path != safe_path:
            # contains unprintable unicode characters that will break other parts of archivebox
            # better to pretend it doesnt exist and fallback to parent dir than crash archivebox
            output_path = None

    # check for a path that is just too long to safely handle across different OS's
    # https://github.com/ArchiveBox/ArchiveBox/issues/549
    if output_path and len(output_path) > 250:
        output_path = None

    if output_path:
        if not nocache:
            cache.set(cache_key, output_path)
        return output_path

    # fallback to just the domain dir
    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+")
    if os.access(search_dir, os.R_OK) and search_dir.is_dir():
        return domain(link.url).replace(":", "+")

    # fallback to just the domain dir without port
    search_dir = Path(link.link_dir) / domain(link.url).split(":", 1)[0]
    if os.access(search_dir, os.R_OK) and search_dir.is_dir():
        return domain(link.url).split(":", 1)[0]

    return None
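A short driver sketch tying the functions above together. The Link instance and the timeout are assumed to come from the caller; this roughly mirrors how ArchiveBox's extractor runner invokes should_save_*/save_* functions, but it is not code from this commit:

from pathlib import Path

from abx_plugin_wget.wget import should_save_wget, save_wget, wget_output_path


def archive_with_wget(link, timeout: int = 60):
    """Run the wget extractor for a single Link from the ArchiveBox index."""
    out_dir = Path(link.link_dir)
    if not should_save_wget(link, out_dir, overwrite=False):
        return None                      # already archived and not overwriting
    result = save_wget(link, out_dir, timeout=timeout)
    # result.output is the relative path to the main HTML file, or the exception that was raised
    print(result.status, wget_output_path(link))
    return result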
archivebox/vendor/abx-plugin-wget/abx_plugin_wget/wget_util.py (new vendored file, 169 lines)

@@ -0,0 +1,169 @@
__package__ = 'abx_plugin_wget'

import re
import os
from pathlib import Path

from typing import Optional


from archivebox.misc.util import (
    enforce_types,
    without_fragment,
    without_query,
    path,
    domain,
    urldecode,
)

@enforce_types
def unsafe_wget_output_path(link) -> Optional[str]:
    # There used to be a bunch of complex reverse-engineering path mapping logic here,
    # but it was removed in favor of just walking through the output folder recursively to try to find the
    # html file that wget produced. It's *much much much* slower than deriving it statically, and is currently
    # one of the main bottlenecks of ArchiveBox's performance (the output data is often on a slow HDD or network mount).
    # But it's STILL better than trying to figure out URL -> html filepath mappings ourselves from first principles.
    full_path = without_fragment(without_query(path(link.url))).strip('/')
    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
    for _ in range(4):
        try:
            if search_dir.exists():
                if search_dir.is_dir():
                    html_files = [
                        f for f in search_dir.iterdir()
                        if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
                    ]
                    if html_files:
                        return str(html_files[0].relative_to(link.link_dir))

                    # sometimes wget'd URLs have no ext and return non-html
                    # e.g. /some/example/rss/all -> some RSS XML content)
                    #      /some/other/url.o4g   -> some binary unrecognized ext)
                    # test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
                    last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
                    for file_present in search_dir.iterdir():
                        if file_present == last_part_of_url:
                            return str((search_dir / file_present).relative_to(link.link_dir))
        except OSError:
            # OSError 36 and others can happen here, caused by trying to check for impossible paths
            # (paths derived from URLs can often contain illegal unicode characters or be too long,
            # causing the OS / filesystem to reject trying to open them with a system-level error)
            pass

        # Move up one directory level
        search_dir = search_dir.parent

        if str(search_dir) == link.link_dir:
            break

    # check for literally any file present that isnt an empty folder
    domain_dir = Path(domain(link.url).replace(":", "+"))
    files_within = [path for path in (Path(link.link_dir) / domain_dir).glob('**/*.*') if not str(path).endswith('.orig')]
    if files_within:
        return str((domain_dir / files_within[-1]).relative_to(link.link_dir))

    # abandon all hope, wget either never downloaded, or it produced an output path so horribly mutilated
    # that it's better we just pretend it doesnt exist
    # this is why ArchiveBox specializes in REDUNDANTLY saving copies of sites with multiple different tools
    return None


@enforce_types
def wget_output_path(link, nocache: bool=False) -> Optional[str]:
    """calculate the path to the wgetted .html file, since wget may
    adjust some paths to be different than the base_url path.

    See docs on: wget --adjust-extension (-E), --restrict-file-names=windows|unix|ascii, --convert-links

    WARNING: this function is extremely error prone because mapping URLs to filesystem paths deterministically
    is basically impossible. Every OS and filesystem has different requirements on what special characters are
    allowed, and URLs are *full* of all kinds of special characters, illegal unicode, and generally unsafe strings
    that you dont want anywhere near your filesystem. Also URLs can be obscenely long, but most filesystems dont
    accept paths longer than 250 characters. On top of all that, this function only exists to try to reverse engineer
    wget's approach to solving this problem, so this is a shittier, less tested version of their already insanely
    complicated attempt to do this. Here be dragons:
    - https://github.com/ArchiveBox/ArchiveBox/issues/549
    - https://github.com/ArchiveBox/ArchiveBox/issues/1373
    - https://stackoverflow.com/questions/9532499/check-whether-a-path-is-valid-in-python-without-creating-a-file-at-the-paths-ta
    - and probably many more that I didn't realize were caused by this...

    The only constructive thing we could possibly do to this function is to figure out how to remove it.

    Preach loudly to anyone who will listen: never attempt to map URLs to filesystem paths,
    and pray you never have to deal with the aftermath of someone else's attempt to do so...
    """

    # Wget downloads can save in a number of different ways depending on the url:
    #    https://example.com
    #       > example.com/index.html
    #    https://example.com?v=zzVa_tX1OiI
    #       > example.com/index.html@v=zzVa_tX1OiI.html
    #    https://www.example.com/?v=zzVa_tX1OiI
    #       > example.com/index.html@v=zzVa_tX1OiI.html

    #    https://example.com/abc
    #       > example.com/abc.html
    #    https://example.com/abc/
    #       > example.com/abc/index.html
    #    https://example.com/abc?v=zzVa_tX1OiI.html
    #       > example.com/abc@v=zzVa_tX1OiI.html
    #    https://example.com/abc/?v=zzVa_tX1OiI.html
    #       > example.com/abc/index.html@v=zzVa_tX1OiI.html

    #    https://example.com/abc/test.html
    #       > example.com/abc/test.html
    #    https://example.com/abc/test?v=zzVa_tX1OiI
    #       > example.com/abc/test@v=zzVa_tX1OiI.html
    #    https://example.com/abc/test/?v=zzVa_tX1OiI
    #       > example.com/abc/test/index.html@v=zzVa_tX1OiI.html

    cache_key = f'{link.url_hash}:{link.timestamp}-{link.downloaded_at and link.downloaded_at.timestamp()}-wget-output-path'

    if not nocache:
        from django.core.cache import cache
        cached_result = cache.get(cache_key)
        if cached_result:
            return cached_result

    # There's also lots of complexity around how the urlencoding and renaming
    # is done for pages with query and hash fragments, extensions like shtml / htm / php / etc,
    # unicode escape sequences, punycode domain names, unicode double-width characters, extensions longer than
    # 4 characters, paths with multiple extensions, etc. the list goes on...

    output_path = None
    try:
        output_path = unsafe_wget_output_path(link)
    except Exception as err:
        pass  # better to pretend it just failed to download than expose gnarly OSErrors to users

    # check for unprintable unicode characters
    # https://github.com/ArchiveBox/ArchiveBox/issues/1373
    if output_path:
        safe_path = output_path.encode('utf-8', 'replace').decode()
        if output_path != safe_path:
            # contains unprintable unicode characters that will break other parts of archivebox
            # better to pretend it doesnt exist and fallback to parent dir than crash archivebox
            output_path = None

    # check for a path that is just too long to safely handle across different OS's
    # https://github.com/ArchiveBox/ArchiveBox/issues/549
    if output_path and len(output_path) > 250:
        output_path = None

    if output_path:
        if not nocache:
            cache.set(cache_key, output_path)
        return output_path

    # fallback to just the domain dir
    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+")
    if os.access(search_dir, os.R_OK) and search_dir.is_dir():
        return domain(link.url).replace(":", "+")

    # fallback to just the domain dir without port
    search_dir = Path(link.link_dir) / domain(link.url).split(":", 1)[0]
    if os.access(search_dir, os.R_OK) and search_dir.is_dir():
        return domain(link.url).split(":", 1)[0]

    return None
archivebox/vendor/abx-plugin-wget/pyproject.toml (new vendored file, 18 lines)

@@ -0,0 +1,18 @@
[project]
name = "abx-plugin-wget"
version = "2024.10.28"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "abx>=0.1.0",
    "abx-spec-config>=0.1.0",
    "abx-spec-pydantic-pkgr>=0.1.0",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project.entry-points.abx]
abx_plugin_wget = "abx_plugin_wget"
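The [project.entry-points.abx] table is what makes the plugin discoverable without hard-coded imports: once the package is installed, the host can enumerate the abx entry-point group with importlib.metadata. A minimal, framework-agnostic sketch (Python 3.10+, matching requires-python above):

from importlib.metadata import entry_points

# Every installed package that advertises an 'abx' entry point shows up here,
# including abx_plugin_wget = "abx_plugin_wget" from the pyproject.toml above.
for ep in entry_points(group='abx'):
    plugin_module = ep.load()   # imports abx_plugin_wget
    print(ep.name, plugin_module)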