Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-17 08:34:26 -04:00)
refactor: wget uses snapshot instead of link
parent e0e65bf4b1
commit f6152ded44

2 changed files with 15 additions and 14 deletions
@@ -6,6 +6,8 @@ from pathlib import Path
 from typing import Optional
 from datetime import datetime
 
+from django.db.models import Model
+
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..system import run, chmod_file
 from ..util import (
@@ -36,9 +38,9 @@ from ..logging_util import TimedProgress
 
 
 @enforce_types
-def should_save_wget(link: Link, out_dir: Optional[Path]=None) -> bool:
-    output_path = wget_output_path(link)
-    out_dir = out_dir or Path(link.link_dir)
+def should_save_wget(snapshot: Model, out_dir: Optional[Path]=None) -> bool:
+    output_path = wget_output_path(snapshot)
+    out_dir = out_dir or Path(snapshot.snapshot_dir)
     if output_path and (out_dir / output_path).exists():
         return False
 
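The guard above is a plain idempotency check: the extractor is skipped whenever a previous run already produced output inside the snapshot's directory. A minimal standalone sketch of the same pattern, using a hypothetical SimpleSnapshot stand-in for the Django model (the real code resolves the output name via wget_output_path):

    from dataclasses import dataclass
    from pathlib import Path
    from typing import Optional

    @dataclass
    class SimpleSnapshot:  # hypothetical stand-in for the Django Snapshot model
        url: str
        snapshot_dir: str

    def should_save(snapshot: SimpleSnapshot, output_path: Optional[str]) -> bool:
        # skip the extractor entirely if a previous run already wrote its output
        out_dir = Path(snapshot.snapshot_dir)
        if output_path and (out_dir / output_path).exists():
            return False
        return True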
@@ -46,7 +48,7 @@ def should_save_wget(link: Link, out_dir: Optional[Path]=None) -> bool:
 
 
 @enforce_types
-def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_wget(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """download full site using wget"""
 
     out_dir = out_dir or link.link_dir
@@ -70,14 +72,14 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
         *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
         *([] if SAVE_WARC else ['--timestamping']),
         *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
-        link.url,
+        snapshot.url,
     ]
 
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
     try:
         result = run(cmd, cwd=str(out_dir), timeout=timeout)
-        output = wget_output_path(link)
+        output = wget_output_path(snapshot)
 
         # parse out number of files downloaded from last line of stderr:
         # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
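The comment above refers to the summary line wget prints on stderr. A minimal sketch of how that file count could be pulled out; the helper name and regex here are illustrative, not the exact parsing ArchiveBox uses:

    import re
    from typing import Optional

    def parse_wget_downloaded_count(stderr: str) -> Optional[int]:
        # wget ends its stderr with a summary like:
        #   "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        # pull the file count out of that last line
        last_line = stderr.strip().rsplit('\n', 1)[-1]
        match = re.search(r'Downloaded:\s+(\d+)\s+files', last_line)
        return int(match.group(1)) if match else None

    assert parse_wget_downloaded_count(
        'Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)'
    ) == 76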
@@ -123,14 +125,14 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
 
 
 @enforce_types
-def wget_output_path(link: Link) -> Optional[str]:
+def wget_output_path(snapshot: Model) -> Optional[str]:
     """calculate the path to the wgetted .html file, since wget may
     adjust some paths to be different than the base_url path.
 
     See docs on wget --adjust-extension (-E)
     """
-    if is_static_file(link.url):
-        return without_scheme(without_fragment(link.url))
+    if is_static_file(snapshot.url):
+        return without_scheme(without_fragment(snapshot.url))
 
     # Wget downloads can save in a number of different ways depending on the url:
     #    https://example.com
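For static files the function short-circuits and returns the URL itself, minus scheme and fragment, as the relative output path. A rough stdlib approximation of those two helpers, for illustration only; the real without_scheme/without_fragment live in the util module imported at the top of the file:

    from urllib.parse import urlsplit, urlunsplit

    def without_fragment(url: str) -> str:
        # drop everything after '#'
        scheme, netloc, path, query, _ = urlsplit(url)
        return urlunsplit((scheme, netloc, path, query, ''))

    def without_scheme(url: str) -> str:
        # drop the leading 'https://' / 'http://'
        _, netloc, path, query, fragment = urlsplit(url)
        return urlunsplit(('', netloc, path, query, fragment)).lstrip('/')

    assert without_scheme(without_fragment('https://example.com/a.pdf#page=2')) == 'example.com/a.pdf'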
@@ -163,8 +165,8 @@ def wget_output_path(link: Link) -> Optional[str]:
     # and there's no way to get the computed output path from wget
     # in order to avoid having to reverse-engineer how they calculate it,
     # we just look in the output folder read the filename wget used from the filesystem
-    full_path = without_fragment(without_query(path(link.url))).strip('/')
-    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
+    full_path = without_fragment(without_query(path(snapshot.url))).strip('/')
+    search_dir = Path(snapshot.snapshot_dir) / domain(snapshot.url).replace(":", "+") / urldecode(full_path)
     for _ in range(4):
         if search_dir.exists():
             if search_dir.is_dir():
@@ -173,12 +175,12 @@ def wget_output_path(link: Link) -> Optional[str]:
                 if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
             ]
             if html_files:
-                return str(html_files[0].relative_to(link.link_dir))
+                return str(html_files[0].relative_to(snapshot.snapshot_dir))
 
         # Move up one directory level
         search_dir = search_dir.parent
 
-        if str(search_dir) == link.link_dir:
+        if str(search_dir) == snapshot.snapshot_dir:
             break
 
     search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
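The loop above reverse-engineers wget's chosen filename by scanning the deepest candidate directory and walking up at most four levels until an .html file turns up. A condensed standalone sketch of that search, with hypothetical argument names:

    import re
    from pathlib import Path
    from typing import Optional

    def find_wget_html(snapshot_dir: Path, start: Path) -> Optional[str]:
        # walk up from the deepest candidate directory toward snapshot_dir,
        # returning the first .html/.shtml file wget left behind
        search_dir = start
        for _ in range(4):
            if search_dir.is_dir():
                html_files = [
                    f for f in search_dir.iterdir()
                    if re.search(r'\.[Ss]?[Hh][Tt][Mm][Ll]?$', f.name)
                ]
                if html_files:
                    return str(html_files[0].relative_to(snapshot_dir))
            search_dir = search_dir.parent
            if search_dir == snapshot_dir:
                break
        return None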
@@ -87,7 +87,6 @@ def write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) ->
 
     out_dir = out_dir or snapshot.snapshot_dir
     path = Path(out_dir) / JSON_INDEX_FILENAME
-    print(snapshot._asdict())
     atomic_write(str(path), snapshot._asdict())
 
 
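In the second changed file, the commit removes a stray debug print before the atomic_write call. Writing atomically keeps readers from ever observing a half-written index file; a sketch of the write-temp-then-rename pattern such a helper typically relies on (this shows the general technique, not ArchiveBox's exact implementation):

    import json
    import os
    import tempfile

    def atomic_write_json(path: str, data: dict) -> None:
        # write to a temp file in the same directory, then rename over the
        # target; os.replace is atomic on POSIX, so concurrent readers see
        # either the old file or the new one, never a partial write
        dir_name = os.path.dirname(path) or '.'
        fd, tmp_path = tempfile.mkstemp(dir=dir_name, suffix='.tmp')
        try:
            with os.fdopen(fd, 'w') as f:
                json.dump(data, f, indent=4, default=str)
            os.replace(tmp_path, path)
        except BaseException:
            os.unlink(tmp_path)
            raise

    atomic_write_json('index.json', {'url': 'https://example.com', 'status': 'succeeded'})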