mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-12 22:25:44 -04:00
Add htmltotext extractor
Saves HTML text nodes and selected element attributes in `htmltotext.txt` for each Snapshot. Primarily intended to be used for search indexing.
This commit is contained in:
parent
6555719489
commit
310b4d1242
9 changed files with 203 additions and 104 deletions
|
@ -143,7 +143,7 @@ def snapshot_icons(snapshot) -> str:
|
|||
"mercury": "🅼",
|
||||
"warc": "📦"
|
||||
}
|
||||
exclude = ["favicon", "title", "headers", "archive_org"]
|
||||
exclude = ["favicon", "title", "headers", "htmltotext", "archive_org"]
|
||||
# Missing specific entry for WARC
|
||||
|
||||
extractor_outputs = defaultdict(lambda: None)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue