Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-06-06 01:21:22 -04:00)

Merge branch 'dev' into link-removal2

Commit 3eaf580fc0: 16 changed files with 226 additions and 129 deletions
archivebox/extractors/favicon.py

@@ -47,14 +47,13 @@ def save_favicon(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
         'https://www.google.com/s2/favicons?domain={}'.format(domain(snapshot.url)),
     ]
-    status = 'pending'
+    status = 'failed'
     timer = TimedProgress(timeout, prefix=' ')
     try:
         run(cmd, cwd=str(out_dir), timeout=timeout)
         chmod_file(output, cwd=str(out_dir))
         status = 'succeeded'
     except Exception as err:
         status = 'failed'
         output = err
     finally:
         timer.end()
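Note: this hunk changes the initial status from 'pending' to 'failed', so a crash before the try block can no longer leave the result recorded as pending. For reference, the command being built shells out to curl against Google's s2 favicon endpoint; a minimal standalone sketch of the same fetch (helper name and curl flags are illustrative assumptions, not the project's exact CURL_ARGS):

    from pathlib import Path
    from subprocess import run

    def fetch_favicon(url_domain: str, out_dir: Path, timeout: int = 60) -> Path:
        # Same idea as save_favicon: ask Google's favicon service for the icon
        # and write it into the snapshot directory as favicon.ico.
        output = out_dir / 'favicon.ico'
        cmd = [
            'curl',
            '--silent',
            '--location',
            '--max-time', str(timeout),
            '--output', str(output),
            'https://www.google.com/s2/favicons?domain={}'.format(url_domain),
        ]
        run(cmd, timeout=timeout, check=True)  # raises CalledProcessError on nonzero exit
        return output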
archivebox/extractors/singlefile.py

@@ -42,7 +42,7 @@ def save_singlefile(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """download full site using single-file"""

     out_dir = out_dir or Path(snapshot.snapshot_dir)
-    output = str(out_dir.absolute() / "singlefile.html")
+    output = "singlefile.html"

     browser_args = chrome_args(TIMEOUT=0)
@@ -54,6 +54,7 @@ def save_singlefile(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
         browser_args,
         snapshot.url,
-        output
+        output,
     ]

     status = 'succeeded'
@@ -74,9 +75,9 @@ def save_singlefile(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
         )

         # Check for common failure cases
-        if (result.returncode > 0):
+        if (result.returncode > 0) or not (out_dir / output).is_file():
             raise ArchiveError('SingleFile was not able to archive the page', hints)
-        chmod_file(output)
+        chmod_file(output, cwd=str(out_dir))
     except (Exception, OSError) as err:
         status = 'failed'
         # TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
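Note: the key hardening here is that a zero exit code from single-file is no longer trusted on its own; the expected output file must also exist. A self-contained sketch of that pattern (binary name and arguments are assumptions for illustration):

    from pathlib import Path
    from subprocess import run, PIPE

    def archive_with_singlefile(url: str, out_dir: Path, timeout: int = 60) -> Path:
        output = 'singlefile.html'  # relative path, resolved against out_dir
        result = run(
            ['single-file', url, output],
            cwd=str(out_dir), stdout=PIPE, stderr=PIPE, timeout=timeout,
        )
        # Fail if the tool reported an error OR silently produced nothing.
        if result.returncode > 0 or not (out_dir / output).is_file():
            raise RuntimeError(f'single-file failed for {url}: {result.stderr[:200]!r}')
        return out_dir / output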
archivebox/extractors/title.py

@@ -10,7 +10,6 @@ from django.db.models import Model
 from ..index.schema import ArchiveResult, ArchiveOutput, ArchiveError
 from ..util import (
     enforce_types,
     is_static_file,
-    download_url,
     htmldecode,
 )
@@ -65,11 +64,8 @@ class TitleParser(HTMLParser):
     # output = '{title}'

 @enforce_types
-def should_save_title(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[str]=None) -> bool:
-    if is_static_file(snapshot.url):
-        False
-
-    # if snapshot already has valid title, skip it
+def should_save_title(snapshot: Model, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
+    # if link already has valid title, skip it
     if not overwrite and snapshot.title and not snapshot.title.lower().startswith('http'):
         return False
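Note: the removed branch contained a classic no-op: a bare False expression where return False was intended, so the static-file short-circuit never actually fired. A minimal illustration of the difference (hypothetical functions, not from the codebase):

    def skip_static_buggy(url: str) -> bool:
        if url.endswith('.pdf'):
            False  # evaluated and discarded; execution falls through
        return True  # always reached, so .pdf urls are never skipped

    def skip_static_fixed(url: str) -> bool:
        if url.endswith('.pdf'):
            return False  # actually short-circuits
        return True

    assert skip_static_buggy('report.pdf') is True   # the bug: should be False
    assert skip_static_fixed('report.pdf') is False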
@@ -118,7 +114,11 @@ def save_title(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
                 .update(title=output)
             snapshot.title = output
         else:
-            raise ArchiveError('Unable to detect page title')
+            # if no content was returned, dont save a title (because it might be a temporary error)
+            if not html:
+                raise ArchiveError('Unable to detect page title')
+            # output = html[:128]  # use first bit of content as the title
+            output = link.base_url  # use the filename as the title (better UX)
     except Exception as err:
         status = 'failed'
         output = err
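Note: after the merge, an empty response still raises (the failure may be transient, so no title is recorded), while a page that merely lacks a title tag falls back to the URL. One thing to watch: the added line references link.base_url even though the function was refactored to take snapshot. A runnable sketch of the intended decision, with a regex standing in for the module's TitleParser:

    import re

    def choose_title(html: str, base_url: str) -> str:
        if not html:
            # no content returned: might be a temporary error, don't record a title
            raise ValueError('Unable to detect page title')
        match = re.search(r'<title[^>]*>(.*?)</title>', html, re.IGNORECASE | re.DOTALL)
        title = match.group(1).strip() if match else ''
        return title or base_url  # fall back to the url itself (better UX than nothing)

    assert choose_title('<title>Example Domain</title>', 'example.com') == 'Example Domain'
    assert choose_title('<p>no title tag</p>', 'example.com') == 'example.com'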
archivebox/extractors/wget.py

@@ -12,8 +12,6 @@ from ..index.schema import ArchiveResult, ArchiveOutput, ArchiveError
 from ..system import run, chmod_file
 from ..util import (
     enforce_types,
     is_static_file,
-    without_scheme,
-    without_fragment,
     without_query,
     path,
@@ -107,7 +105,12 @@ def save_wget(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
         if b'ERROR 500: Internal Server Error' in result.stderr:
-            raise ArchiveError('500 Internal Server Error', hints)
+            raise ArchiveError('Wget failed or got an error from the server', hints)
-        chmod_file(output, cwd=str(out_dir))
+
+        if (out_dir / output).exists():
+            chmod_file(output, cwd=str(out_dir))
+        else:
+            print(f' {out_dir}/{output}')
+            raise ArchiveError('Failed to find wget output after running', hints)
     except Exception as err:
         status = 'failed'
         output = err
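Note: wget can exit cleanly and still write nothing useful, so the merged code scans stderr for known failure strings and then requires the output to exist on disk before chmod-ing it. A condensed sketch of that verification step (names are illustrative):

    from pathlib import Path
    from subprocess import CompletedProcess

    def verify_wget_result(result: CompletedProcess, out_dir: Path, output: str) -> None:
        # Known-bad server responses show up in stderr even on "successful" runs.
        if b'ERROR 500: Internal Server Error' in result.stderr:
            raise RuntimeError('Wget failed or got an error from the server')
        # Success also requires an artifact we can actually index.
        if not (out_dir / output).exists():
            raise RuntimeError('Failed to find wget output after running')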
@@ -131,8 +134,6 @@ def wget_output_path(snapshot: Model) -> Optional[str]:

     See docs on wget --adjust-extension (-E)
     """
-    if is_static_file(snapshot.url):
-        return without_scheme(without_fragment(snapshot.url))

     # Wget downloads can save in a number of different ways depending on the url:
     #    https://example.com
@@ -184,7 +185,7 @@ def wget_output_path(snapshot: Model) -> Optional[str]:
         last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
         for file_present in search_dir.iterdir():
             if file_present == last_part_of_url:
-                return str(search_dir / file_present)
+                return str((search_dir / file_present).relative_to(snapshot.snapshot_dir))

         # Move up one directory level
         search_dir = search_dir.parent
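Note: the recurring theme in these wget_output_path hunks (and the singlefile output change above) is returning paths relative to the snapshot directory rather than absolute, so index entries survive the archive root being moved. The directory names below are hypothetical:

    from pathlib import Path

    snapshot_dir = Path('/data/archive/1613411098')      # hypothetical snapshot dir
    found = snapshot_dir / 'example.com' / 'index.html'  # file discovered by the search

    print(found)                            # /data/archive/1613411098/example.com/index.html
    print(found.relative_to(snapshot_dir))  # example.com/index.html  <- what gets indexed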
@@ -192,10 +193,15 @@ def wget_output_path(snapshot: Model) -> Optional[str]:
         if search_dir == snapshot.snapshot_dir:
             break

+    # check for literally any file present that isnt an empty folder
+    domain_dir = Path(domain(snapshot.url).replace(":", "+"))
+    files_within = list((Path(snapshot.snapshot_dir) / domain_dir).glob('**/*.*'))
+    if files_within:
+        return str((domain_dir / files_within[-1]).relative_to(snapshot.snapshot_dir))
+
     search_dir = Path(snapshot.snapshot_dir) / domain(snapshot.url).replace(":", "+") / urldecode(full_path)
     if not search_dir.is_dir():
         return str(search_dir.relative_to(snapshot.snapshot_dir))

     # fallback to just the domain dir, dont try to introspect further inside it
     search_dir = Path(snapshot.snapshot_dir) / domain(snapshot.url).replace(":", "+")
     if search_dir.is_dir():
         return domain(snapshot.url).replace(":", "+")

     return None
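Note: taken together, the final hunks give wget_output_path a most-specific-to-least-specific search: match the exact filename while walking up the tree, then accept any real file under the domain folder, then fall back to the bare domain folder. A simplified sketch of that fallback cascade (helper signature is an assumption):

    from pathlib import Path
    from typing import Optional

    def find_wget_output(snapshot_dir: Path, domain_dir: str) -> Optional[str]:
        root = snapshot_dir / domain_dir
        # middle fallback: literally any file present that isn't an empty folder
        files_within = [p for p in root.glob('**/*.*') if p.is_file()]
        if files_within:
            return str(files_within[-1].relative_to(snapshot_dir))
        # last resort: the bare domain dir, without introspecting further
        if root.is_dir():
            return domain_dir
        return None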