From 1d823af5017b42327c98d6056e6efdc4d8c49f70 Mon Sep 17 00:00:00 2001 From: Jakob Borg Date: Wed, 15 Jun 2022 10:05:05 +0200 Subject: [PATCH] Correctly handle non-ASCII runes in patterns (fixes #54) When matching a row, we calculate an index into the string, and this index was in runes. However, when slicing the string, Go uses byte indexes. This change tracks both, using the rune count to determine the correct length and the byte index to slice the string. --- glob_test.go | 10 ++++++++++ match/row.go | 15 +++++++++------ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/glob_test.go b/glob_test.go index 810036f..fb6b2ff 100644 --- a/glob_test.go +++ b/glob_test.go @@ -162,6 +162,16 @@ func TestGlob(t *testing.T) { glob(true, pattern_prefix_suffix, fixture_prefix_suffix_match), glob(false, pattern_prefix_suffix, fixture_prefix_suffix_mismatch), + + glob(true, "155ö", "155ö"), + glob(true, "1?5ö", "155ö"), // <- + glob(true, "1?ö5", "15ö5"), + glob(true, "155helloö", "155helloö"), + glob(true, "1?5helloö", "155helloö"), // <- + glob(true, "1?ö5hello", "15ö5hello"), + glob(true, "1?5heöllo", "155heöllo"), + glob(true, "1ö?5", "1ö55"), // <- + glob(true, "ö1?5", "ö155"), } { t.Run("", func(t *testing.T) { g := MustCompile(test.pattern, test.delimiters...) diff --git a/match/row.go b/match/row.go index 4379042..a34d0a5 100644 --- a/match/row.go +++ b/match/row.go @@ -2,6 +2,7 @@ package match import ( "fmt" + "unicode/utf8" ) type Row struct { @@ -23,19 +24,21 @@ func (self Row) matchAll(s string) bool { for _, m := range self.Matchers { length := m.Len() - var next, i int - for next = range s[idx:] { - i++ - if i == length { + var runeCount, byteIdx int + var r rune + for _, r = range s[idx:] { + runeCount++ + byteIdx += utf8.RuneLen(r) + if runeCount == length { break } } - if i < length || !m.Match(s[idx:idx+next+1]) { + if runeCount < length || !m.Match(s[idx:idx+byteIdx]) { return false } - idx += next + 1 + idx += byteIdx } return true