mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-06-02 07:39:52 -04:00
Merge branch 'dev' into DanielBatteryStapler-patch-1
This commit is contained in:
commit
110a22ee32
9 changed files with 2138 additions and 1148 deletions
|
@ -159,10 +159,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
||||||
'--write-thumbnail',
|
'--write-thumbnail',
|
||||||
'--no-call-home',
|
'--no-call-home',
|
||||||
'--write-sub',
|
'--write-sub',
|
||||||
'--all-subs',
|
'--write-auto-subs',
|
||||||
# There are too many of these and youtube
|
|
||||||
# throttles you with HTTP error 429
|
|
||||||
#'--write-auto-subs',
|
|
||||||
'--convert-subs=srt',
|
'--convert-subs=srt',
|
||||||
'--yes-playlist',
|
'--yes-playlist',
|
||||||
'--continue',
|
'--continue',
|
||||||
|
@ -175,7 +172,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
||||||
'--ignore-errors',
|
'--ignore-errors',
|
||||||
'--geo-bypass',
|
'--geo-bypass',
|
||||||
'--add-metadata',
|
'--add-metadata',
|
||||||
'--max-filesize={}'.format(c['MEDIA_MAX_SIZE']),
|
'--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
|
||||||
]},
|
]},
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -9,6 +9,7 @@ from ..util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
chrome_args,
|
chrome_args,
|
||||||
|
chrome_cleanup,
|
||||||
)
|
)
|
||||||
from ..config import (
|
from ..config import (
|
||||||
TIMEOUT,
|
TIMEOUT,
|
||||||
|
@ -57,6 +58,7 @@ def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
status = 'failed'
|
status = 'failed'
|
||||||
output = err
|
output = err
|
||||||
|
chrome_cleanup()
|
||||||
finally:
|
finally:
|
||||||
timer.end()
|
timer.end()
|
||||||
|
|
||||||
|
|
|
@ -9,6 +9,7 @@ from ..util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
chrome_args,
|
chrome_args,
|
||||||
|
chrome_cleanup,
|
||||||
)
|
)
|
||||||
from ..config import (
|
from ..config import (
|
||||||
TIMEOUT,
|
TIMEOUT,
|
||||||
|
@ -54,6 +55,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
status = 'failed'
|
status = 'failed'
|
||||||
output = err
|
output = err
|
||||||
|
chrome_cleanup()
|
||||||
finally:
|
finally:
|
||||||
timer.end()
|
timer.end()
|
||||||
|
|
||||||
|
|
|
@ -9,6 +9,7 @@ from ..util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
chrome_args,
|
chrome_args,
|
||||||
|
chrome_cleanup,
|
||||||
)
|
)
|
||||||
from ..config import (
|
from ..config import (
|
||||||
TIMEOUT,
|
TIMEOUT,
|
||||||
|
@ -54,6 +55,7 @@ def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
status = 'failed'
|
status = 'failed'
|
||||||
output = err
|
output = err
|
||||||
|
chrome_cleanup()
|
||||||
finally:
|
finally:
|
||||||
timer.end()
|
timer.end()
|
||||||
|
|
||||||
|
|
|
@ -441,7 +441,7 @@ def log_archive_method_finished(result: "ArchiveResult"):
|
||||||
|
|
||||||
hints = (
|
hints = (
|
||||||
' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
|
' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
|
||||||
for line in hints[:5] if line.strip()
|
for line in list(hints)[:5] if line.strip()
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -17,6 +17,8 @@ from requests.exceptions import RequestException, ReadTimeout
|
||||||
|
|
||||||
from .vendor.base32_crockford import encode as base32_encode # type: ignore
|
from .vendor.base32_crockford import encode as base32_encode # type: ignore
|
||||||
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
|
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
|
||||||
|
from os.path import lexists
|
||||||
|
from os import remove as remove_file
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import chardet
|
import chardet
|
||||||
|
@ -272,6 +274,16 @@ def chrome_args(**options) -> List[str]:
|
||||||
|
|
||||||
return cmd_args
|
return cmd_args
|
||||||
|
|
||||||
|
def chrome_cleanup():
    """
    Cleans up any state or runtime files that chrome leaves behind when killed by
    a timeout or other error.

    Only acts when IN_DOCKER is truthy; in that case it deletes the chromium
    SingletonLock file under the archivebox user's config directory.
    A lock that is already gone is not treated as an error.
    """

    # NOTE(review): imported at call time rather than module level, presumably
    # to avoid an import cycle with .config -- confirm before hoisting.
    from .config import IN_DOCKER

    if not IN_DOCKER:
        return

    lock_path = "/home/archivebox/.config/chromium/SingletonLock"

    # Remove first and tolerate absence instead of checking lexists() up front:
    # the lock can vanish between a check and the delete, and this helper runs
    # inside extractor `except` handlers, where a stray FileNotFoundError would
    # mask the original failure. os.remove() unlinks the path itself, so a
    # dangling symlink is still removed, as it was with the lexists() check.
    try:
        remove_file(lock_path)
    except FileNotFoundError:
        pass
|
||||||
|
|
||||||
def ansi_to_html(text):
|
def ansi_to_html(text):
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
#!/usr/bin/env bash
|
#!/bin/bash
|
||||||
|
|
||||||
DATA_DIR="${DATA_DIR:-/data}"
|
DATA_DIR="${DATA_DIR:-/data}"
|
||||||
ARCHIVEBOX_USER="${ARCHIVEBOX_USER:-archivebox}"
|
ARCHIVEBOX_USER="${ARCHIVEBOX_USER:-archivebox}"
|
||||||
|
|
3254
package-lock.json
generated
3254
package-lock.json
generated
File diff suppressed because it is too large
Load diff
|
@ -7,7 +7,8 @@
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@postlight/mercury-parser": "git+https://github.com/postlight/mercury-parser.git",
|
"@postlight/mercury-parser": "git+https://github.com/postlight/mercury-parser.git",
|
||||||
|
"playwright": "^1.37.1",
|
||||||
"readability-extractor": "git+https://github.com/ArchiveBox/readability-extractor.git",
|
"readability-extractor": "git+https://github.com/ArchiveBox/readability-extractor.git",
|
||||||
"single-file": "git+https://github.com/gildas-lormeau/SingleFile.git"
|
"single-file-cli": "^1.0.63"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue