mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-15 23:54:27 -04:00
refactor: headers uses snapshot instead of link
This commit is contained in:
parent
29ec48a35f
commit
b9489c971c
1 changed file with 9 additions and 7 deletions
|
@@ -4,6 +4,8 @@ from pathlib import Path
|
||||||
|
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
from django.db.models import Model
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||||
from ..system import atomic_write
|
from ..system import atomic_write
|
||||||
from ..util import (
|
from ..util import (
|
||||||
|
@@ -22,18 +24,18 @@ from ..config import (
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def should_save_headers(link: Link, out_dir: Optional[str]=None) -> bool:
|
def should_save_headers(snapshot: Model, out_dir: Optional[str]=None) -> bool:
|
||||||
out_dir = out_dir or link.link_dir
|
out_dir = out_dir or snapshot.snapshot_dir
|
||||||
|
|
||||||
output = Path(out_dir or link.link_dir) / 'headers.json'
|
output = Path(out_dir or snapshot.snapshot_dir) / 'headers.json'
|
||||||
return not output.exists() and SAVE_HEADERS
|
return not output.exists() and SAVE_HEADERS
|
||||||
|
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
def save_headers(snapshot: Model, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
||||||
"""Download site headers"""
|
"""Download site headers"""
|
||||||
|
|
||||||
out_dir = Path(out_dir or link.link_dir)
|
out_dir = Path(out_dir or snapshot.snapshot_dir)
|
||||||
output_folder = out_dir.absolute()
|
output_folder = out_dir.absolute()
|
||||||
output: ArchiveOutput = 'headers.json'
|
output: ArchiveOutput = 'headers.json'
|
||||||
|
|
||||||
|
@@ -47,10 +49,10 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
|
||||||
'--max-time', str(timeout),
|
'--max-time', str(timeout),
|
||||||
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
||||||
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||||
link.url,
|
snapshot.url,
|
||||||
]
|
]
|
||||||
try:
|
try:
|
||||||
json_headers = get_headers(link.url, timeout=timeout)
|
json_headers = get_headers(snapshot.url, timeout=timeout)
|
||||||
output_folder.mkdir(exist_ok=True)
|
output_folder.mkdir(exist_ok=True)
|
||||||
atomic_write(str(output_folder / "headers.json"), json_headers)
|
atomic_write(str(output_folder / "headers.json"), json_headers)
|
||||||
except (Exception, OSError) as err:
|
except (Exception, OSError) as err:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue