mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-14 07:04:27 -04:00
bufixes
This commit is contained in:
parent
1a16221752
commit
5c2bbe7efe
11 changed files with 44 additions and 61 deletions
|
@ -6,18 +6,18 @@ from typing import Optional, List, Dict, Tuple
|
|||
from collections import defaultdict
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, PIPE, DEVNULL, chmod_file
|
||||
from ..system import run, chmod_file
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
)
|
||||
from ..config import (
|
||||
VERSION,
|
||||
TIMEOUT,
|
||||
CHECK_SSL_VALIDITY,
|
||||
SAVE_ARCHIVE_DOT_ORG,
|
||||
CURL_BINARY,
|
||||
CURL_VERSION,
|
||||
CHECK_SSL_VALIDITY
|
||||
CURL_USER_AGENT,
|
||||
)
|
||||
from ..cli.logging import TimedProgress
|
||||
|
||||
|
@ -45,17 +45,18 @@ def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=T
|
|||
submit_url = 'https://web.archive.org/save/{}'.format(link.url)
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
'--silent',
|
||||
'--location',
|
||||
'--head',
|
||||
'--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
|
||||
'--max-time', str(timeout),
|
||||
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
||||
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||
submit_url,
|
||||
]
|
||||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout)
|
||||
result = run(cmd, cwd=out_dir, timeout=timeout)
|
||||
content_location, errors = parse_archive_dot_org_response(result.stdout)
|
||||
if content_location:
|
||||
archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
|
||||
|
|
|
@ -5,7 +5,7 @@ import os
|
|||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, PIPE, chmod_file
|
||||
from ..system import run, chmod_file
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
|
@ -47,7 +47,7 @@ def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> A
|
|||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
with open(output_path, 'w+') as f:
|
||||
result = run(cmd, stdout=f, stderr=PIPE, cwd=out_dir, timeout=timeout)
|
||||
result = run(cmd, stdout=f, cwd=out_dir, timeout=timeout)
|
||||
|
||||
if result.returncode:
|
||||
hints = result.stderr.decode()
|
||||
|
|
|
@ -5,7 +5,7 @@ import os
|
|||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..system import chmod_file, run, PIPE
|
||||
from ..system import chmod_file, run
|
||||
from ..util import enforce_types, domain
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
|
@ -38,14 +38,14 @@ def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
|
|||
'--max-time', str(timeout),
|
||||
'--location',
|
||||
'--output', str(output),
|
||||
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else [],
|
||||
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
||||
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||
'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
|
||||
]
|
||||
status = 'pending'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
|
||||
run(cmd, cwd=out_dir, timeout=timeout)
|
||||
chmod_file(output, cwd=out_dir)
|
||||
status = 'succeeded'
|
||||
except Exception as err:
|
||||
|
|
|
@ -5,7 +5,7 @@ import os
|
|||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, PIPE, chmod_file
|
||||
from ..system import run, chmod_file
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
|
@ -64,7 +64,7 @@ def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> A
|
|||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
|
||||
result = run(cmd, cwd=output_path, timeout=timeout + 1)
|
||||
|
||||
if result.returncode == 128:
|
||||
# ignore failed re-download when the folder already exists
|
||||
|
|
|
@ -5,7 +5,7 @@ import os
|
|||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, PIPE, chmod_file
|
||||
from ..system import run, chmod_file
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
|
@ -66,7 +66,7 @@ def save_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEO
|
|||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
|
||||
result = run(cmd, cwd=output_path, timeout=timeout + 1)
|
||||
chmod_file(output, cwd=out_dir)
|
||||
if result.returncode:
|
||||
if (b'ERROR: Unsupported URL' in result.stderr
|
||||
|
|
|
@ -5,7 +5,7 @@ import os
|
|||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, PIPE, chmod_file
|
||||
from ..system import run, chmod_file
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
|
@ -45,7 +45,7 @@ def save_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> A
|
|||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
|
||||
result = run(cmd, cwd=out_dir, timeout=timeout)
|
||||
|
||||
if result.returncode:
|
||||
hints = (result.stderr or result.stdout).decode()
|
||||
|
|
|
@ -5,7 +5,7 @@ import os
|
|||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, PIPE, chmod_file
|
||||
from ..system import run, chmod_file
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
|
@ -45,7 +45,7 @@ def save_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOU
|
|||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
|
||||
result = run(cmd, cwd=out_dir, timeout=timeout)
|
||||
|
||||
if result.returncode:
|
||||
hints = (result.stderr or result.stdout).decode()
|
||||
|
|
|
@ -12,9 +12,11 @@ from ..util import (
|
|||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
CHECK_SSL_VALIDITY,
|
||||
SAVE_TITLE,
|
||||
CURL_BINARY,
|
||||
CURL_VERSION,
|
||||
CURL_USER_AGENT,
|
||||
)
|
||||
from ..cli.logging import TimedProgress
|
||||
|
||||
|
@ -44,6 +46,11 @@ def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
|
|||
output: ArchiveOutput = None
|
||||
cmd = [
|
||||
CURL_BINARY,
|
||||
'--silent',
|
||||
'--max-time', str(timeout),
|
||||
'--location',
|
||||
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
||||
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||
link.url,
|
||||
'|',
|
||||
'grep',
|
||||
|
|
|
@ -7,7 +7,7 @@ from typing import Optional
|
|||
from datetime import datetime
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, PIPE
|
||||
from ..system import run
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
|
@ -81,7 +81,7 @@ def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
|
|||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
|
||||
result = run(cmd, cwd=out_dir, timeout=timeout)
|
||||
output = wget_output_path(link)
|
||||
|
||||
# parse out number of files downloaded from last line of stderr:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue