mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-09 12:21:57 -04:00
bugfixes
This commit is contained in:
parent
1a16221752
commit
5c2bbe7efe
11 changed files with 44 additions and 61 deletions
|
@ -78,7 +78,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
|
||||||
'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com'},
|
'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com'},
|
||||||
'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
|
'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
|
||||||
|
|
||||||
'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) curl/{CURL_VERSION}'}
|
'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) curl/{CURL_VERSION}'},
|
||||||
'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}'},
|
'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}'},
|
||||||
'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'},
|
'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'},
|
||||||
|
|
||||||
|
|
|
@ -6,18 +6,18 @@ from typing import Optional, List, Dict, Tuple
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
from ..system import run, PIPE, DEVNULL, chmod_file
|
from ..system import run, chmod_file
|
||||||
from ..util import (
|
from ..util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
)
|
)
|
||||||
from ..config import (
|
from ..config import (
|
||||||
VERSION,
|
|
||||||
TIMEOUT,
|
TIMEOUT,
|
||||||
|
CHECK_SSL_VALIDITY,
|
||||||
SAVE_ARCHIVE_DOT_ORG,
|
SAVE_ARCHIVE_DOT_ORG,
|
||||||
CURL_BINARY,
|
CURL_BINARY,
|
||||||
CURL_VERSION,
|
CURL_VERSION,
|
||||||
CHECK_SSL_VALIDITY
|
CURL_USER_AGENT,
|
||||||
)
|
)
|
||||||
from ..cli.logging import TimedProgress
|
from ..cli.logging import TimedProgress
|
||||||
|
|
||||||
|
@ -45,17 +45,18 @@ def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=T
|
||||||
submit_url = 'https://web.archive.org/save/{}'.format(link.url)
|
submit_url = 'https://web.archive.org/save/{}'.format(link.url)
|
||||||
cmd = [
|
cmd = [
|
||||||
CURL_BINARY,
|
CURL_BINARY,
|
||||||
|
'--silent',
|
||||||
'--location',
|
'--location',
|
||||||
'--head',
|
'--head',
|
||||||
'--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
|
|
||||||
'--max-time', str(timeout),
|
'--max-time', str(timeout),
|
||||||
|
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
||||||
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||||
submit_url,
|
submit_url,
|
||||||
]
|
]
|
||||||
status = 'succeeded'
|
status = 'succeeded'
|
||||||
timer = TimedProgress(timeout, prefix=' ')
|
timer = TimedProgress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout)
|
result = run(cmd, cwd=out_dir, timeout=timeout)
|
||||||
content_location, errors = parse_archive_dot_org_response(result.stdout)
|
content_location, errors = parse_archive_dot_org_response(result.stdout)
|
||||||
if content_location:
|
if content_location:
|
||||||
archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
|
archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
|
||||||
|
|
|
@ -5,7 +5,7 @@ import os
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
from ..system import run, PIPE, chmod_file
|
from ..system import run, chmod_file
|
||||||
from ..util import (
|
from ..util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
|
@ -47,7 +47,7 @@ def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> A
|
||||||
timer = TimedProgress(timeout, prefix=' ')
|
timer = TimedProgress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
with open(output_path, 'w+') as f:
|
with open(output_path, 'w+') as f:
|
||||||
result = run(cmd, stdout=f, stderr=PIPE, cwd=out_dir, timeout=timeout)
|
result = run(cmd, stdout=f, cwd=out_dir, timeout=timeout)
|
||||||
|
|
||||||
if result.returncode:
|
if result.returncode:
|
||||||
hints = result.stderr.decode()
|
hints = result.stderr.decode()
|
||||||
|
|
|
@ -5,7 +5,7 @@ import os
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||||
from ..system import chmod_file, run, PIPE
|
from ..system import chmod_file, run
|
||||||
from ..util import enforce_types, domain
|
from ..util import enforce_types, domain
|
||||||
from ..config import (
|
from ..config import (
|
||||||
TIMEOUT,
|
TIMEOUT,
|
||||||
|
@ -38,14 +38,14 @@ def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
|
||||||
'--max-time', str(timeout),
|
'--max-time', str(timeout),
|
||||||
'--location',
|
'--location',
|
||||||
'--output', str(output),
|
'--output', str(output),
|
||||||
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else [],
|
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
||||||
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||||
'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
|
'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
|
||||||
]
|
]
|
||||||
status = 'pending'
|
status = 'pending'
|
||||||
timer = TimedProgress(timeout, prefix=' ')
|
timer = TimedProgress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
|
run(cmd, cwd=out_dir, timeout=timeout)
|
||||||
chmod_file(output, cwd=out_dir)
|
chmod_file(output, cwd=out_dir)
|
||||||
status = 'succeeded'
|
status = 'succeeded'
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
|
|
|
@ -5,7 +5,7 @@ import os
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
from ..system import run, PIPE, chmod_file
|
from ..system import run, chmod_file
|
||||||
from ..util import (
|
from ..util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
|
@ -64,7 +64,7 @@ def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> A
|
||||||
status = 'succeeded'
|
status = 'succeeded'
|
||||||
timer = TimedProgress(timeout, prefix=' ')
|
timer = TimedProgress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
|
result = run(cmd, cwd=output_path, timeout=timeout + 1)
|
||||||
|
|
||||||
if result.returncode == 128:
|
if result.returncode == 128:
|
||||||
# ignore failed re-download when the folder already exists
|
# ignore failed re-download when the folder already exists
|
||||||
|
|
|
@ -5,7 +5,7 @@ import os
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
from ..system import run, PIPE, chmod_file
|
from ..system import run, chmod_file
|
||||||
from ..util import (
|
from ..util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
|
@ -66,7 +66,7 @@ def save_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEO
|
||||||
status = 'succeeded'
|
status = 'succeeded'
|
||||||
timer = TimedProgress(timeout, prefix=' ')
|
timer = TimedProgress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
|
result = run(cmd, cwd=output_path, timeout=timeout + 1)
|
||||||
chmod_file(output, cwd=out_dir)
|
chmod_file(output, cwd=out_dir)
|
||||||
if result.returncode:
|
if result.returncode:
|
||||||
if (b'ERROR: Unsupported URL' in result.stderr
|
if (b'ERROR: Unsupported URL' in result.stderr
|
||||||
|
|
|
@ -5,7 +5,7 @@ import os
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
from ..system import run, PIPE, chmod_file
|
from ..system import run, chmod_file
|
||||||
from ..util import (
|
from ..util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
|
@ -45,7 +45,7 @@ def save_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> A
|
||||||
status = 'succeeded'
|
status = 'succeeded'
|
||||||
timer = TimedProgress(timeout, prefix=' ')
|
timer = TimedProgress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
|
result = run(cmd, cwd=out_dir, timeout=timeout)
|
||||||
|
|
||||||
if result.returncode:
|
if result.returncode:
|
||||||
hints = (result.stderr or result.stdout).decode()
|
hints = (result.stderr or result.stdout).decode()
|
||||||
|
|
|
@ -5,7 +5,7 @@ import os
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
from ..system import run, PIPE, chmod_file
|
from ..system import run, chmod_file
|
||||||
from ..util import (
|
from ..util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
|
@ -45,7 +45,7 @@ def save_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOU
|
||||||
status = 'succeeded'
|
status = 'succeeded'
|
||||||
timer = TimedProgress(timeout, prefix=' ')
|
timer = TimedProgress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
|
result = run(cmd, cwd=out_dir, timeout=timeout)
|
||||||
|
|
||||||
if result.returncode:
|
if result.returncode:
|
||||||
hints = (result.stderr or result.stdout).decode()
|
hints = (result.stderr or result.stdout).decode()
|
||||||
|
|
|
@ -12,9 +12,11 @@ from ..util import (
|
||||||
)
|
)
|
||||||
from ..config import (
|
from ..config import (
|
||||||
TIMEOUT,
|
TIMEOUT,
|
||||||
|
CHECK_SSL_VALIDITY,
|
||||||
SAVE_TITLE,
|
SAVE_TITLE,
|
||||||
CURL_BINARY,
|
CURL_BINARY,
|
||||||
CURL_VERSION,
|
CURL_VERSION,
|
||||||
|
CURL_USER_AGENT,
|
||||||
)
|
)
|
||||||
from ..cli.logging import TimedProgress
|
from ..cli.logging import TimedProgress
|
||||||
|
|
||||||
|
@ -44,6 +46,11 @@ def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
|
||||||
output: ArchiveOutput = None
|
output: ArchiveOutput = None
|
||||||
cmd = [
|
cmd = [
|
||||||
CURL_BINARY,
|
CURL_BINARY,
|
||||||
|
'--silent',
|
||||||
|
'--max-time', str(timeout),
|
||||||
|
'--location',
|
||||||
|
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
||||||
|
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||||
link.url,
|
link.url,
|
||||||
'|',
|
'|',
|
||||||
'grep',
|
'grep',
|
||||||
|
|
|
@ -7,7 +7,7 @@ from typing import Optional
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
from ..system import run, PIPE
|
from ..system import run
|
||||||
from ..util import (
|
from ..util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
|
@ -81,7 +81,7 @@ def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
|
||||||
status = 'succeeded'
|
status = 'succeeded'
|
||||||
timer = TimedProgress(timeout, prefix=' ')
|
timer = TimedProgress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
|
result = run(cmd, cwd=out_dir, timeout=timeout)
|
||||||
output = wget_output_path(link)
|
output = wget_output_path(link)
|
||||||
|
|
||||||
# parse out number of files downloaded from last line of stderr:
|
# parse out number of files downloaded from last line of stderr:
|
||||||
|
|
|
@ -4,69 +4,44 @@ __package__ = 'archivebox'
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
|
|
||||||
import json as pyjson
|
from json import dump
|
||||||
|
from pathlib import Path
|
||||||
from typing import Optional, Union, Set, Tuple
|
from typing import Optional, Union, Set, Tuple
|
||||||
|
from subprocess import run as subprocess_run
|
||||||
|
|
||||||
from crontab import CronTab
|
from crontab import CronTab
|
||||||
from atomicwrites import atomic_write as awrite
|
from atomicwrites import atomic_write as lib_atomic_write
|
||||||
|
|
||||||
from subprocess import (
|
|
||||||
Popen,
|
|
||||||
PIPE,
|
|
||||||
DEVNULL,
|
|
||||||
CompletedProcess,
|
|
||||||
TimeoutExpired,
|
|
||||||
CalledProcessError,
|
|
||||||
)
|
|
||||||
|
|
||||||
from .util import enforce_types, ExtendedEncoder
|
from .util import enforce_types, ExtendedEncoder
|
||||||
from .config import OUTPUT_PERMISSIONS
|
from .config import OUTPUT_PERMISSIONS
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
|
def run(*args, input=None, capture_output=True, text=True, timeout=None, check=False, **kwargs):
|
||||||
"""Patched version of subprocess.run to fix blocking IO making timeout ineffective"""
|
"""Patched version of subprocess.run to fix blocking IO making timeout ineffective"""
|
||||||
|
|
||||||
if input is not None:
|
if input is not None:
|
||||||
if 'stdin' in kwargs:
|
if 'stdin' in kwargs:
|
||||||
raise ValueError('stdin and input arguments may not both be used.')
|
raise ValueError('stdin and input arguments may not both be used.')
|
||||||
kwargs['stdin'] = PIPE
|
|
||||||
|
|
||||||
if capture_output:
|
if capture_output:
|
||||||
if ('stdout' in kwargs) or ('stderr' in kwargs):
|
if ('stdout' in kwargs) or ('stderr' in kwargs):
|
||||||
raise ValueError('stdout and stderr arguments may not be used '
|
raise ValueError('stdout and stderr arguments may not be used '
|
||||||
'with capture_output.')
|
'with capture_output.')
|
||||||
kwargs['stdout'] = PIPE
|
|
||||||
kwargs['stderr'] = PIPE
|
|
||||||
|
|
||||||
with Popen(*popenargs, **kwargs) as process:
|
return subprocess_run(*args, input=input, capture_output=capture_output, text=text, timeout=timeout, check=check, **kwargs)
|
||||||
try:
|
|
||||||
stdout, stderr = process.communicate(input, timeout=timeout)
|
|
||||||
except TimeoutExpired:
|
|
||||||
process.kill()
|
|
||||||
try:
|
|
||||||
stdout, stderr = process.communicate(input, timeout=2)
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
raise TimeoutExpired(popenargs[0][0], timeout)
|
|
||||||
except BaseException:
|
|
||||||
process.kill()
|
|
||||||
# We don't call process.wait() as .__exit__ does that for us.
|
|
||||||
raise
|
|
||||||
retcode = process.poll()
|
|
||||||
if check and retcode:
|
|
||||||
raise CalledProcessError(retcode, process.args,
|
|
||||||
output=stdout, stderr=stderr)
|
|
||||||
return CompletedProcess(process.args, retcode, stdout, stderr)
|
|
||||||
|
|
||||||
|
@enforce_types
|
||||||
def atomic_write(path: str, contents: Union[dict, str, bytes], overwrite: bool=True) -> None:
|
def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], overwrite: bool=True) -> None:
|
||||||
"""Safe atomic write to filesystem by writing to temp file + atomic rename"""
|
"""Safe atomic write to filesystem by writing to temp file + atomic rename"""
|
||||||
|
|
||||||
with awrite(path, overwrite=overwrite) as f:
|
mode = 'wb+' if isinstance(contents, bytes) else 'w'
|
||||||
|
|
||||||
|
# print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}')
|
||||||
|
with lib_atomic_write(path, mode=mode, overwrite=overwrite) as f:
|
||||||
if isinstance(contents, dict):
|
if isinstance(contents, dict):
|
||||||
pyjson.dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
|
dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
|
||||||
else:
|
elif isinstance(contents, (bytes, str)):
|
||||||
f.write(contents)
|
f.write(contents)
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
|
@ -76,7 +51,7 @@ def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, tim
|
||||||
if not os.path.exists(os.path.join(cwd, path)):
|
if not os.path.exists(os.path.join(cwd, path)):
|
||||||
raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
|
raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
|
||||||
|
|
||||||
chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
|
chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, timeout=timeout)
|
||||||
if chmod_result.returncode == 1:
|
if chmod_result.returncode == 1:
|
||||||
print(' ', chmod_result.stderr.decode())
|
print(' ', chmod_result.stderr.decode())
|
||||||
raise Exception('Failed to chmod {}/{}'.format(cwd, path))
|
raise Exception('Failed to chmod {}/{}'.format(cwd, path))
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue