ArchiveBox/archivebox/search/utils.py
Ross Williams 6555719489 Add space after tags when extracting text
Add space after any close tag to ensure that
tokens that would be rendered separately in HTML
get extracted as separate tokens in text.

Example:

`<p>First</p><p>Second</p>` --> `First Second`
NOT `FirstSecond`
2023-10-16 09:59:08 -04:00

140 lines
4.2 KiB
Python

from html.parser import HTMLParser
import io
from django.db.models import QuerySet
from archivebox.util import enforce_types
from archivebox.config import ANSI, SEARCH_PROCESS_HTML
# Chunk size used when streaming a file through the HTML text
# extractor (see _extract_html_text), so large documents don't
# have to be read into memory at once.
BLOCK_SIZE = 32768
def log_index_started(url):
    """Announce on stdout that *url* is being written to the search index."""
    message = '{green}[*] Indexing url: {} in the search index {reset}'
    print(message.format(url, **ANSI))
    print()
class HTMLTextExtractor(HTMLParser):
    """Stream-parse HTML and accumulate the human-readable text it contains.

    Text is collected from character data and from a small whitelist of
    attribute values (TEXT_ATTRS); everything inside <script>/<style>/
    <template> subtrees is dropped.  A space is written after every
    closing tag so tokens from adjacent elements stay separate, e.g.
    "<p>First</p><p>Second</p>" yields "First Second ", not "FirstSecond".
    Call str() on the instance to get the extracted text.
    """

    # Attribute values worth indexing as text.
    TEXT_ATTRS = ["alt", "cite", "href", "label", "list", "placeholder", "title", "value"]
    # Tags whose entire subtree carries no indexable text.
    NOTEXT_TAGS = ["script", "style", "template"]
    # href prefixes that are code/navigation, not searchable text.
    NOTEXT_HREF = ["data:", "javascript:", "#"]

    def __init__(self):
        super().__init__()
        self.output = io.StringIO()  # accumulates extracted text
        self._tag_stack = []         # currently-open tags, innermost last

    def _is_text_attr(self, name, value):
        """Return True if attribute *name*=*value* should be extracted as text."""
        if not isinstance(value, str):
            return False
        # str.startswith accepts a tuple of prefixes -- one call, no lambda/map.
        if name == "href" and value.startswith(tuple(self.NOTEXT_HREF)):
            return False
        return name in self.TEXT_ATTRS

    def _parent_tag(self):
        """Return the innermost open tag, or None at the document root."""
        try:
            return self._tag_stack[-1]
        except IndexError:
            return None

    def _in_notext_tag(self):
        """Return True if any open ancestor tag suppresses text output."""
        # Generator expression: no intermediate list, short-circuits on match.
        return any(t in self._tag_stack for t in self.NOTEXT_TAGS)

    def handle_starttag(self, tag, attrs):
        self._tag_stack.append(tag)
        # Don't write out attribute values if any ancestor is in NOTEXT_TAGS.
        if self._in_notext_tag():
            return
        for name, value in attrs:
            if self._is_text_attr(name, value):
                self.output.write(value.strip())
                self.output.write(" ")

    def handle_endtag(self, tag):
        orig_stack = self._tag_stack.copy()
        try:
            # Keep popping tags until we find the nearest
            # ancestor matching this end tag.
            while tag != self._tag_stack.pop():
                pass
            # Write a space after every tag, to ensure that tokens
            # in tag text aren't concatenated. This may result in
            # excess spaces, which should be ignored by search tokenizers.
            if not self._in_notext_tag() and tag not in self.NOTEXT_TAGS:
                self.output.write(" ")
        except IndexError:
            # Got to the top of the stack, but somehow missed
            # this end tag -- maybe malformed markup -- restore the
            # stack.
            self._tag_stack = orig_stack

    def handle_data(self, data):
        # Don't output text data if any ancestor is in NOTEXT_TAGS.
        if self._in_notext_tag():
            return
        self.output.write(data)

    def __str__(self):
        """Return all text extracted so far."""
        return self.output.getvalue()
def _read_all(file: io.TextIOBase) -> str:
return file.read()
def _extract_html_text(file: io.TextIOBase) -> str:
    """Stream *file* through HTMLTextExtractor and return the extracted text.

    Reads in BLOCK_SIZE chunks so large documents never have to fit in
    memory all at once.
    """
    extractor = HTMLTextExtractor()
    while (block := file.read(BLOCK_SIZE)):
        extractor.feed(block)
    # No `while/else` needed: there is no `break`, so close() always runs here.
    extractor.close()
    return str(extractor)
def get_file_result_content(res, extra_path, use_pwd=False, *, filter=_read_all):
    """Read the output file for *res* and return its contents as a one-item list.

    The path is built from res.pwd/res.output when use_pwd is True, else from
    res.output alone, with *extra_path* appended when given.  *filter* is
    called with the open file handle to produce the text (defaults to reading
    the whole file).  Returns [] when the filtered result is empty/falsy.
    """
    fpath = f'{res.pwd}/{res.output}' if use_pwd else f'{res.output}'
    if extra_path:
        fpath = f'{fpath}/{extra_path}'
    with open(fpath, 'r', encoding='utf-8', errors='replace') as file:
        data = filter(file)
    return [data] if data else []
# This should be abstracted by a plugin interface for extractors
@enforce_types
def get_indexable_content(results: QuerySet):
    """Return a list of text payloads to index for the first result in *results*.

    Only a whitelisted set of extractor methods produces indexable output;
    anything else (or an empty queryset) yields [].
    """
    if not results:
        return []
    # Only use the first method available.
    # Fetch the row once -- calling results.first() twice issued a
    # duplicate query for the same object.
    res = results.first()
    method = res.extractor
    if method not in ('readability', 'singlefile', 'dom', 'wget'):
        return []
    # This should come from a plugin interface
    # TODO: banish this duplication and get these from the extractor file
    if method == 'readability':
        return get_file_result_content(res, 'content.txt', use_pwd=True)
    if method == 'singlefile':
        filter = _extract_html_text if SEARCH_PROCESS_HTML else _read_all
        return get_file_result_content(res, '', use_pwd=True, filter=filter)
    # 'dom' and 'wget' both index the raw output file as-is.
    return get_file_result_content(res, '', use_pwd=True)