new archivebox update speed improvements

This commit is contained in:
Nick Sweeting 2024-02-22 04:49:09 -08:00
parent 2d32f05a62
commit 6a4e568d1b
8 changed files with 36 additions and 13 deletions

View file

@ -121,9 +121,11 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
out_dir = Path(out_dir or link.link_dir)
output = "htmltotext.txt"
cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
timer = TimedProgress(timeout, prefix=' ')
extracted_text = None
status = 'failed'
try:
extractor = HTMLTextExtractor()
document = get_html(link, out_dir)
@ -136,10 +138,9 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
extracted_text = str(extractor)
atomic_write(str(out_dir / output), extracted_text)
status = 'succeeded'
except (Exception, OSError) as err:
status = 'failed'
output = err
cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
finally:
timer.end()