mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-23 03:06:55 -04:00
Add htmltotext extractor
Saves HTML text nodes and selected element attributes in `htmltotext.txt` for each Snapshot. Primarily intended to be used for search indexing.
This commit is contained in:
parent
6555719489
commit
310b4d1242
9 changed files with 203 additions and 104 deletions
|
@ -429,6 +429,7 @@ class Link:
|
|||
'singlefile_path': 'singlefile.html',
|
||||
'readability_path': 'readability/content.html',
|
||||
'mercury_path': 'mercury/content.html',
|
||||
'htmltotext_path': 'htmltotext.txt',
|
||||
'pdf_path': 'output.pdf',
|
||||
'screenshot_path': 'screenshot.png',
|
||||
'dom_path': 'output.html',
|
||||
|
@ -452,6 +453,7 @@ class Link:
|
|||
'singlefile_path': static_path,
|
||||
'readability_path': static_path,
|
||||
'mercury_path': static_path,
|
||||
'htmltotext_path': static_path,
|
||||
})
|
||||
return canonical
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue