Merge branch 'dev' into method_allow_deny

This commit is contained in:
Nick Sweeting 2023-10-20 04:25:44 -07:00 committed by GitHub
commit 63ad43f46c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
33 changed files with 4485 additions and 1748 deletions

View file

@@ -9,6 +9,7 @@ from ..util import (
enforce_types,
is_static_file,
chrome_args,
+chrome_cleanup,
)
from ..config import (
TIMEOUT,
@@ -57,6 +58,7 @@ def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
except Exception as err:
status = 'failed'
output = err
+chrome_cleanup()
finally:
timer.end()

View file

@@ -9,6 +9,7 @@ from ..util import (
enforce_types,
is_static_file,
chrome_args,
+chrome_cleanup,
)
from ..config import (
TIMEOUT,
@@ -54,6 +55,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
except Exception as err:
status = 'failed'
output = err
+chrome_cleanup()
finally:
timer.end()

View file

@@ -71,7 +71,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
result = run(cmd, cwd=out_dir, timeout=timeout)
try:
result_json = json.loads(result.stdout)
-assert result_json and 'content' in result_json
+assert result_json and 'content' in result_json, 'Readability output is not valid JSON'
except json.JSONDecodeError:
raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr)
@@ -85,7 +85,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
output_tail = [
line.strip()
-for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
+for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
if line.strip()
]
hints = (

View file

@@ -9,6 +9,7 @@ from ..util import (
enforce_types,
is_static_file,
chrome_args,
+chrome_cleanup,
)
from ..config import (
TIMEOUT,
@@ -54,6 +55,7 @@ def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
except Exception as err:
status = 'failed'
output = err
+chrome_cleanup()
finally:
timer.end()

View file

@@ -26,7 +26,7 @@ from ..logging_util import TimedProgress
HTML_TITLE_REGEX = re.compile(
r'<title.*?>' # start matching text after <title> tag
-r'(.[^<>]+)', # get everything up to these symbols
+r'([^<>]+)', # get everything up to these symbols
re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)