Add htmltotext extractor

Saves HTML text nodes and selected element attributes in
`htmltotext.txt` for each Snapshot. Primarily intended to be used
for search indexing.
This commit is contained in:
Ross Williams 2023-10-23 21:42:25 -04:00
parent 6555719489
commit 310b4d1242
9 changed files with 203 additions and 104 deletions

View file

@@ -143,7 +143,7 @@ def snapshot_icons(snapshot) -> str:
"mercury": "🅼",
"warc": "📦"
}
-exclude = ["favicon", "title", "headers", "archive_org"]
+exclude = ["favicon", "title", "headers", "htmltotext", "archive_org"]
# Missing specific entry for WARC
extractor_outputs = defaultdict(lambda: None)