Merge branch 'dev' into link-removal2

This commit is contained in:
Nick Sweeting 2021-02-01 02:46:57 -05:00
commit 3eaf580fc0
16 changed files with 226 additions and 129 deletions

View file

@ -47,14 +47,13 @@ def save_favicon(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIM
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
'https://www.google.com/s2/favicons?domain={}'.format(domain(snapshot.url)),
]
status = 'pending'
status = 'failed'
timer = TimedProgress(timeout, prefix=' ')
try:
run(cmd, cwd=str(out_dir), timeout=timeout)
chmod_file(output, cwd=str(out_dir))
status = 'succeeded'
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()

View file

@ -42,7 +42,7 @@ def save_singlefile(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=
"""download full site using single-file"""
out_dir = out_dir or Path(snapshot.snapshot_dir)
output = str(out_dir.absolute() / "singlefile.html")
output = "singlefile.html"
browser_args = chrome_args(TIMEOUT=0)
@ -54,6 +54,7 @@ def save_singlefile(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=
browser_args,
snapshot.url,
output
output,
]
status = 'succeeded'
@ -74,9 +75,9 @@ def save_singlefile(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=
)
# Check for common failure cases
if (result.returncode > 0):
if (result.returncode > 0) or not (out_dir / output).is_file():
raise ArchiveError('SingleFile was not able to archive the page', hints)
chmod_file(output)
chmod_file(output, cwd=str(out_dir))
except (Exception, OSError) as err:
status = 'failed'
# TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).

View file

@ -10,7 +10,6 @@ from django.db.models import Model
from ..index.schema import ArchiveResult, ArchiveOutput, ArchiveError
from ..util import (
enforce_types,
is_static_file,
download_url,
htmldecode,
)
@ -65,11 +64,8 @@ class TitleParser(HTMLParser):
# output = '{title}'
@enforce_types
def should_save_title(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[str]=None) -> bool:
if is_static_file(snapshot.url):
False
# if snapshot already has valid title, skip it
def should_save_title(snapshot: Model, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
# if link already has valid title, skip it
if not overwrite and snapshot.title and not snapshot.title.lower().startswith('http'):
return False
@ -118,7 +114,11 @@ def save_title(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEO
.update(title=output)
snapshot.title = output
else:
raise ArchiveError('Unable to detect page title')
# if no content was returned, don't save a title (because it might be a temporary error)
if not html:
raise ArchiveError('Unable to detect page title')
# output = html[:128] # use first bit of content as the title
output = link.base_url # use the filename as the title (better UX)
except Exception as err:
status = 'failed'
output = err

View file

@ -12,8 +12,6 @@ from ..index.schema import ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, chmod_file
from ..util import (
enforce_types,
is_static_file,
without_scheme,
without_fragment,
without_query,
path,
@ -107,7 +105,12 @@ def save_wget(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOU
if b'ERROR 500: Internal Server Error' in result.stderr:
raise ArchiveError('500 Internal Server Error', hints)
raise ArchiveError('Wget failed or got an error from the server', hints)
chmod_file(output, cwd=str(out_dir))
if (out_dir / output).exists():
chmod_file(output, cwd=str(out_dir))
else:
print(f' {out_dir}/{output}')
raise ArchiveError('Failed to find wget output after running', hints)
except Exception as err:
status = 'failed'
output = err
@ -131,8 +134,6 @@ def wget_output_path(snapshot: Model) -> Optional[str]:
See docs on wget --adjust-extension (-E)
"""
if is_static_file(snapshot.url):
return without_scheme(without_fragment(snapshot.url))
# Wget downloads can save in a number of different ways depending on the url:
# https://example.com
@ -184,7 +185,7 @@ def wget_output_path(snapshot: Model) -> Optional[str]:
last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
for file_present in search_dir.iterdir():
if file_present == last_part_of_url:
return str(search_dir / file_present)
return str((search_dir / file_present).relative_to(snapshot.snapshot_dir))
# Move up one directory level
search_dir = search_dir.parent
@ -192,10 +193,15 @@ def wget_output_path(snapshot: Model) -> Optional[str]:
if search_dir == snapshot.snapshot_dir:
break
# check for literally any file present that isn't an empty folder
domain_dir = Path(domain(snapshot.url).replace(":", "+"))
files_within = list((Path(snapshot.snapshot_dir) / domain_dir).glob('**/*.*'))
if files_within:
return str((domain_dir / files_within[-1]).relative_to(snapshot.snapshot_dir))
search_dir = Path(snapshot.snapshot_dir) / domain(snapshot.url).replace(":", "+") / urldecode(full_path)
if not search_dir.is_dir():
return str(search_dir.relative_to(snapshot.snapshot_dir))
# fall back to just the domain dir; don't try to introspect further inside it
search_dir = Path(snapshot.snapshot_dir) / domain(snapshot.url).replace(":", "+")
if search_dir.is_dir():
return domain(snapshot.url).replace(":", "+")
return None