mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-06-01 07:18:27 -04:00
new archivebox update speed improvements
This commit is contained in:
parent
2d32f05a62
commit
6a4e568d1b
8 changed files with 36 additions and 13 deletions
|
@ -121,9 +121,11 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
|
|||
|
||||
out_dir = Path(out_dir or link.link_dir)
|
||||
output = "htmltotext.txt"
|
||||
cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
|
||||
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
extracted_text = None
|
||||
status = 'failed'
|
||||
try:
|
||||
extractor = HTMLTextExtractor()
|
||||
document = get_html(link, out_dir)
|
||||
|
@ -136,10 +138,9 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
|
|||
extracted_text = str(extractor)
|
||||
|
||||
atomic_write(str(out_dir / output), extracted_text)
|
||||
status = 'succeeded'
|
||||
except (Exception, OSError) as err:
|
||||
status = 'failed'
|
||||
output = err
|
||||
cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue