Fix regression in search interaction with wide unicode characters.

Apparently, we had incorrectly concluded that the search result
highlighting offsets that highlight_string receives from tsearch_extras
are measured in bytes, whereas in fact they are measured in characters.
This commit is contained in:
Tim Abbott 2016-08-24 23:00:52 -07:00
parent 88fce4761a
commit a90b470205
1 changed files with 9 additions and 7 deletions

View File

@ -10,7 +10,7 @@ from django.db.models import Q
from django.http import HttpRequest, HttpResponse
from six import text_type
from typing import Any, AnyStr, Iterable, Optional, Tuple
from zerver.lib.str_utils import force_bytes
from zerver.lib.str_utils import force_text
from zerver.decorator import authenticated_api_view, authenticated_json_post_view, \
has_request_variables, REQ, JsonableError, \
@ -261,14 +261,16 @@ class NarrowBuilder(object):
cond = column("search_tsvector").op("@@")(tsquery)
return query.where(maybe_negate(cond))
# Apparently, the offsets we get from tsearch_extras are counted in
# unicode characters, not in bytes, so we do our processing with text,
# not bytes.
def highlight_string(text, locs):
# type: (AnyStr, Iterable[Tuple[int, int]]) -> text_type
string = force_bytes(text)
# Do all operations on bytes because tsearch_extras counts bytes instead of characters.
highlight_start = b'<span class="highlight">'
highlight_stop = b'</span>'
string = force_text(text)
highlight_start = u'<span class="highlight">'
highlight_stop = u'</span>'
pos = 0
result = b''
result = u''
for loc in locs:
(offset, length) = loc
result += string[pos:offset]
@ -277,7 +279,7 @@ def highlight_string(text, locs):
result += highlight_stop
pos = offset + length
result += string[pos:]
return result.decode('utf-8')
return result
def get_search_fields(rendered_content, subject, content_matches, subject_matches):
# type: (text_type, text_type, Iterable[Tuple[int, int]], Iterable[Tuple[int, int]]) -> Dict[str, text_type]