Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-17 08:34:26 -04:00)
refactor: wget uses snapshot instead of link
parent e0e65bf4b1
commit f6152ded44

2 changed files with 15 additions and 14 deletions
@@ -6,6 +6,8 @@ from pathlib import Path
 from typing import Optional
 from datetime import datetime
 
+from django.db.models import Model
+
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..system import run, chmod_file
 from ..util import (
@@ -36,9 +38,9 @@ from ..logging_util import TimedProgress
 
 
 @enforce_types
-def should_save_wget(link: Link, out_dir: Optional[Path]=None) -> bool:
-    output_path = wget_output_path(link)
-    out_dir = out_dir or Path(link.link_dir)
+def should_save_wget(snapshot: Model, out_dir: Optional[Path]=None) -> bool:
+    output_path = wget_output_path(snapshot)
+    out_dir = out_dir or Path(snapshot.snapshot_dir)
     if output_path and (out_dir / output_path).exists():
         return False
 
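The guard above is a plain idempotency check: the extractor is skipped whenever a previous run already produced output inside the snapshot's directory. A minimal standalone sketch of the same pattern, using a hypothetical SimpleSnapshot stand-in for the Django model (the real code resolves the output name via wget_output_path):

    from dataclasses import dataclass
    from pathlib import Path
    from typing import Optional

    @dataclass
    class SimpleSnapshot:  # hypothetical stand-in for the Django Snapshot model
        url: str
        snapshot_dir: str

    def should_save(snapshot: SimpleSnapshot, output_path: Optional[str]) -> bool:
        # skip the extractor entirely if a previous run already wrote its output
        out_dir = Path(snapshot.snapshot_dir)
        if output_path and (out_dir / output_path).exists():
            return False
        return True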
@@ -46,7 +48,7 @@ def should_save_wget(link: Link, out_dir: Optional[Path]=None) -> bool:
 
 
 @enforce_types
-def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_wget(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """download full site using wget"""
 
     out_dir = out_dir or link.link_dir
@@ -70,14 +72,14 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
         *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
         *([] if SAVE_WARC else ['--timestamping']),
         *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
-        link.url,
+        snapshot.url,
     ]
 
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
     try:
         result = run(cmd, cwd=str(out_dir), timeout=timeout)
-        output = wget_output_path(link)
+        output = wget_output_path(snapshot)
 
         # parse out number of files downloaded from last line of stderr:
         # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
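The comment above refers to the summary line wget prints on stderr. A minimal sketch of how that file count could be pulled out; the helper name and regex here are illustrative, not the exact parsing ArchiveBox uses:

    import re
    from typing import Optional

    def parse_wget_downloaded_count(stderr: str) -> Optional[int]:
        # wget ends its stderr with a summary like:
        #   "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        # pull the file count out of that last line
        last_line = stderr.strip().rsplit('\n', 1)[-1]
        match = re.search(r'Downloaded:\s+(\d+)\s+files', last_line)
        return int(match.group(1)) if match else None

    assert parse_wget_downloaded_count(
        'Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)'
    ) == 76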
@@ -123,14 +125,14 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
 
 
 @enforce_types
-def wget_output_path(link: Link) -> Optional[str]:
+def wget_output_path(snapshot: Model) -> Optional[str]:
     """calculate the path to the wgetted .html file, since wget may
     adjust some paths to be different than the base_url path.
 
     See docs on wget --adjust-extension (-E)
     """
-    if is_static_file(link.url):
-        return without_scheme(without_fragment(link.url))
+    if is_static_file(snapshot.url):
+        return without_scheme(without_fragment(snapshot.url))
 
     # Wget downloads can save in a number of different ways depending on the url:
     #    https://example.com
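For static files the function short-circuits and returns the URL itself, minus scheme and fragment, as the relative output path. A rough stdlib approximation of those two helpers, for illustration only; the real without_scheme/without_fragment live in the util module imported at the top of the file:

    from urllib.parse import urlsplit, urlunsplit

    def without_fragment(url: str) -> str:
        # drop everything after '#'
        scheme, netloc, path, query, _ = urlsplit(url)
        return urlunsplit((scheme, netloc, path, query, ''))

    def without_scheme(url: str) -> str:
        # drop the leading 'https://' / 'http://'
        _, netloc, path, query, fragment = urlsplit(url)
        return urlunsplit(('', netloc, path, query, fragment)).lstrip('/')

    assert without_scheme(without_fragment('https://example.com/a.pdf#page=2')) == 'example.com/a.pdf'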
@@ -163,8 +165,8 @@ def wget_output_path(link: Link) -> Optional[str]:
     # and there's no way to get the computed output path from wget
     # in order to avoid having to reverse-engineer how they calculate it,
     # we just look in the output folder read the filename wget used from the filesystem
-    full_path = without_fragment(without_query(path(link.url))).strip('/')
-    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
+    full_path = without_fragment(without_query(path(snapshot.url))).strip('/')
+    search_dir = Path(snapshot.snapshot_dir) / domain(snapshot.url).replace(":", "+") / urldecode(full_path)
     for _ in range(4):
         if search_dir.exists():
             if search_dir.is_dir():
@@ -173,12 +175,12 @@ def wget_output_path(link: Link) -> Optional[str]:
                 if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
             ]
             if html_files:
-                return str(html_files[0].relative_to(link.link_dir))
+                return str(html_files[0].relative_to(snapshot.snapshot_dir))
 
         # Move up one directory level
         search_dir = search_dir.parent
 
-        if str(search_dir) == link.link_dir:
+        if str(search_dir) == snapshot.snapshot_dir:
             break
 
     search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
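The loop above reverse-engineers wget's chosen filename by scanning the deepest candidate directory and walking up at most four levels until an .html file turns up. A condensed standalone sketch of that search, with hypothetical argument names:

    import re
    from pathlib import Path
    from typing import Optional

    def find_wget_html(snapshot_dir: Path, start: Path) -> Optional[str]:
        # walk up from the deepest candidate directory toward snapshot_dir,
        # returning the first .html/.shtml file wget left behind
        search_dir = start
        for _ in range(4):
            if search_dir.is_dir():
                html_files = [
                    f for f in search_dir.iterdir()
                    if re.search(r'\.[Ss]?[Hh][Tt][Mm][Ll]?$', f.name)
                ]
                if html_files:
                    return str(html_files[0].relative_to(snapshot_dir))
            search_dir = search_dir.parent
            if search_dir == snapshot_dir:
                break
        return None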
@@ -87,7 +87,6 @@ def write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) ->
 
     out_dir = out_dir or snapshot.snapshot_dir
     path = Path(out_dir) / JSON_INDEX_FILENAME
-    print(snapshot._asdict())
     atomic_write(str(path), snapshot._asdict())
 
 
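In the second changed file, the commit removes a stray debug print before the atomic_write call. Writing atomically keeps readers from ever observing a half-written index file; a sketch of the write-temp-then-rename pattern such a helper typically relies on (this shows the general technique, not ArchiveBox's exact implementation):

    import json
    import os
    import tempfile

    def atomic_write_json(path: str, data: dict) -> None:
        # write to a temp file in the same directory, then rename over the
        # target; os.replace is atomic on POSIX, so concurrent readers see
        # either the old file or the new one, never a partial write
        dir_name = os.path.dirname(path) or '.'
        fd, tmp_path = tempfile.mkstemp(dir=dir_name, suffix='.tmp')
        try:
            with os.fdopen(fd, 'w') as f:
                json.dump(data, f, indent=4, default=str)
            os.replace(tmp_path, path)
        except BaseException:
            os.unlink(tmp_path)
            raise

    atomic_write_json('index.json', {'url': 'https://example.com', 'status': 'succeeded'})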