Skip to content

Commit

Permalink
Handle hyphens in user dir search porperly (#17254)
Browse files Browse the repository at this point in the history
c.f. #16675
  • Loading branch information
erikjohnston authored Jun 5, 2024
1 parent 7d8f0ef commit aabf577
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 6 deletions.
1 change: 1 addition & 0 deletions changelog.d/17254.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix searching for users with their exact localpart whose ID includes a hyphen.
66 changes: 60 additions & 6 deletions synapse/storage/databases/main/user_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -1281,7 +1281,7 @@ def _parse_words_with_regex(search_term: str) -> List[str]:
Break down search term into words, when we don't have ICU available.
See: `_parse_words`
"""
return re.findall(r"([\w\-]+)", search_term, re.UNICODE)
return re.findall(r"([\w-]+)", search_term, re.UNICODE)


def _parse_words_with_icu(search_term: str) -> List[str]:
Expand All @@ -1303,15 +1303,69 @@ def _parse_words_with_icu(search_term: str) -> List[str]:
if j < 0:
break

result = search_term[i:j]
# We want to make sure that we split on `@` and `:` specifically, as
# they occur in user IDs.
for result in re.split(r"[@:]+", search_term[i:j]):
results.append(result.strip())

i = j

# libicu will break up words that have punctuation in them, but to handle
# cases where user IDs have '-', '.' and '_' in them we want to *not* break
# those into words and instead allow the DB to tokenise them how it wants.
#
# In particular, user-71 in postgres gets tokenised to "user, -71", and this
# will not match a query for "user, 71".
new_results: List[str] = []
i = 0
while i < len(results):
curr = results[i]

prev = None
next = None
if i > 0:
prev = results[i - 1]
if i + 1 < len(results):
next = results[i + 1]

i += 1

# libicu considers spaces and punctuation between words as words, but we don't
# want to include those in results as they would result in syntax errors in SQL
# queries (e.g. "foo bar" would result in the search query including "foo & &
# bar").
if len(re.findall(r"([\w\-]+)", result, re.UNICODE)):
results.append(result)
if not curr:
continue

if curr in ["-", ".", "_"]:
prefix = ""
suffix = ""

# Check if the next item is a word, and if so use it as the suffix.
# We check for if its a word as we don't want to concatenate
# multiple punctuation marks.
if next is not None and re.match(r"\w", next):
suffix = next
i += 1 # We're using next, so we skip it in the outer loop.
else:
# We want to avoid creating terms like "user-", as we should
# strip trailing punctuation.
continue

i = j
if prev and re.match(r"\w", prev) and new_results:
prefix = new_results[-1]
new_results.pop()

# We might not have a prefix here, but that's fine as we want to
# ensure that we don't strip preceding punctuation e.g. '-71'
# shouldn't be converted to '71'.

new_results.append(f"{prefix}{curr}{suffix}")
continue
elif not re.match(r"\w", curr):
# Ignore other punctuation
continue

new_results.append(curr)

return results
return new_results
39 changes: 39 additions & 0 deletions tests/handlers/test_user_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -1061,6 +1061,45 @@ def test_ignore_display_names_with_null_codepoints(self) -> None:
{alice: ProfileInfo(display_name=None, avatar_url=MXC_DUMMY)},
)

def test_search_punctuation(self) -> None:
"""Test that you can search for a user that includes punctuation"""

searching_user = self.register_user("searcher", "password")
searching_user_tok = self.login("searcher", "password")

room_id = self.helper.create_room_as(
searching_user,
room_version=RoomVersions.V1.identifier,
tok=searching_user_tok,
)

# We want to test searching for users of the form e.g. "user-1", with
# various punctuation. We also test both where the prefix is numeric and
# alphanumeric, as e.g. postgres tokenises "user-1" as "user" and "-1".
i = 1
for char in ["-", ".", "_"]:
for use_numeric in [False, True]:
if use_numeric:
prefix1 = f"{i}"
prefix2 = f"{i+1}"
else:
prefix1 = f"a{i}"
prefix2 = f"a{i+1}"

local_user_1 = self.register_user(f"user{char}{prefix1}", "password")
local_user_2 = self.register_user(f"user{char}{prefix2}", "password")

self._add_user_to_room(room_id, RoomVersions.V1, local_user_1)
self._add_user_to_room(room_id, RoomVersions.V1, local_user_2)

results = self.get_success(
self.handler.search_users(searching_user, local_user_1, 20)
)["results"]
received_user_id_ordering = [result["user_id"] for result in results]
self.assertSequenceEqual(received_user_id_ordering[:1], [local_user_1])

i += 2


class TestUserDirSearchDisabled(unittest.HomeserverTestCase):
servlets = [
Expand Down
4 changes: 4 additions & 0 deletions tests/storage/test_user_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -711,6 +711,10 @@ def test_icu_word_boundary_punctuation(self) -> None:
),
)

self.assertEqual(_parse_words_with_icu("user-1"), ["user-1"])
self.assertEqual(_parse_words_with_icu("user-ab"), ["user-ab"])
self.assertEqual(_parse_words_with_icu("user.--1"), ["user", "-1"])

def test_regex_word_boundary_punctuation(self) -> None:
"""
Tests the behaviour of punctuation with the non-ICU tokeniser
Expand Down

0 comments on commit aabf577

Please sign in to comment.