Add htmltotext extractor

Saves HTML text nodes and selected element attributes in
`htmltotext.txt` for each Snapshot. Primarily intended to be used
for search indexing.
This commit is contained in:
Ross Williams 2023-10-23 21:42:25 -04:00
parent 6555719489
commit 310b4d1242
9 changed files with 203 additions and 104 deletions

View file

@@ -429,6 +429,7 @@ class Link:
'singlefile_path': 'singlefile.html',
'readability_path': 'readability/content.html',
'mercury_path': 'mercury/content.html',
'htmltotext_path': 'htmltotext.txt',
'pdf_path': 'output.pdf',
'screenshot_path': 'screenshot.png',
'dom_path': 'output.html',
@@ -452,6 +453,7 @@ class Link:
'singlefile_path': static_path,
'readability_path': static_path,
'mercury_path': static_path,
'htmltotext_path': static_path,
})
return canonical