From 2823642c1233f68fda558769d7f9f40ed3eeaf5c Mon Sep 17 00:00:00 2001 From: Adriano Caloiaro Date: Sat, 7 Dec 2019 17:56:41 -0500 Subject: [PATCH 1/2] Add normalized and normalized-fold varieties of match/find/rank --- fuzzy/fuzzy.go | 126 +++++++++++++++++++++++++++------- fuzzy/fuzzy_test.go | 163 +++++++++++++++++++++++++++++++++++++++++++- go.mod | 2 + 3 files changed, 266 insertions(+), 25 deletions(-) diff --git a/fuzzy/fuzzy.go b/fuzzy/fuzzy.go index 33d4898..1751372 100644 --- a/fuzzy/fuzzy.go +++ b/fuzzy/fuzzy.go @@ -3,11 +3,19 @@ package fuzzy import ( + "bytes" "unicode" "unicode/utf8" + + "golang.org/x/text/runes" + "golang.org/x/text/transform" + "golang.org/x/text/unicode/norm" ) -var noop = func(r rune) rune { return r } +var foldTransformer = unicodeFoldTransformer{} +var noopTransformer = transform.Nop +var normalizeTransformer = transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC) +var normalizeFoldTransformer = transform.Chain(normalizeTransformer, foldTransformer) // Match returns true if source matches target using a fuzzy-searching // algorithm. Note that it doesn't implement Levenshtein distance (see @@ -15,15 +23,28 @@ var noop = func(r rune) rune { return r } // approximation. The method will return true only if each character in the // source can be found in the target and occurs after the preceding matches. func Match(source, target string) bool { - return match(source, target, noop) + return match(source, target, noopTransformer) } // MatchFold is a case-insensitive version of Match. func MatchFold(source, target string) bool { - return match(source, target, unicode.ToLower) + return match(source, target, foldTransformer) +} + +// MatchNormalized is a unicode-normalized version of Match. 
+func MatchNormalized(source, target string) bool { + return match(source, target, normalizeTransformer) } -func match(source, target string, fn func(rune) rune) bool { +// MatchNormalizedFold is a unicode-normalized and case-insensitive version of Match. +func MatchNormalizedFold(source, target string) bool { + return match(source, target, normalizeFoldTransformer) +} + +func match(source, target string, transformer transform.Transformer) bool { + source = stringTransform(source, transformer) + target = stringTransform(target, transformer) + lenDiff := len(target) - len(source) if lenDiff < 0 { @@ -37,7 +58,7 @@ func match(source, target string, fn func(rune) rune) bool { Outer: for _, r1 := range source { for i, r2 := range target { - if fn(r1) == fn(r2) { + if r1 == r2 { target = target[i+utf8.RuneLen(r2):] continue Outer } @@ -50,19 +71,29 @@ Outer: // Find will return a list of strings in targets that fuzzy matches source. func Find(source string, targets []string) []string { - return find(source, targets, noop) + return find(source, targets, noopTransformer) } // FindFold is a case-insensitive version of Find. func FindFold(source string, targets []string) []string { - return find(source, targets, unicode.ToLower) + return find(source, targets, foldTransformer) } -func find(source string, targets []string, fn func(rune) rune) []string { +// FindNormalized is a unicode-normalized version of Find. +func FindNormalized(source string, targets []string) []string { + return find(source, targets, normalizeTransformer) +} + +// FindNormalizedFold is a unicode-normalized and case-insensitive version of Find. 
+func FindNormalizedFold(source string, targets []string) []string { + return find(source, targets, normalizeFoldTransformer) +} + +func find(source string, targets []string, transformer transform.Transformer) []string { var matches []string for _, target := range targets { - if match(source, target, fn) { + if match(source, target, transformer) { matches = append(matches, target) } } @@ -77,21 +108,34 @@ func find(source string, targets []string, fn func(rune) rune) []string { // the Levenshtein calculation, only deletions need be considered, required // additions and substitutions would fail the match test. func RankMatch(source, target string) int { - return rank(source, target, noop) + return rank(source, target, noopTransformer) } // RankMatchFold is a case-insensitive version of RankMatch. func RankMatchFold(source, target string) int { - return rank(source, target, unicode.ToLower) + return rank(source, target, foldTransformer) } -func rank(source, target string, fn func(rune) rune) int { +// RankMatchNormalized is a unicode-normalized version of RankMatch. +func RankMatchNormalized(source, target string) int { + return rank(source, target, normalizeTransformer) +} + +// RankMatchNormalizedFold is a unicode-normalized and case-insensitive version of RankMatch. 
+func RankMatchNormalizedFold(source, target string) int { + return rank(source, target, normalizeFoldTransformer) +} + +func rank(source, target string, transformer transform.Transformer) int { lenDiff := len(target) - len(source) if lenDiff < 0 { return -1 } + source = stringTransform(source, transformer) + target = stringTransform(target, transformer) + if lenDiff == 0 && source == target { return 0 } @@ -101,7 +145,7 @@ func rank(source, target string, fn func(rune) rune) int { Outer: for _, r1 := range source { for i, r2 := range target { - if fn(r1) == fn(r2) { + if r1 == r2 { target = target[i+utf8.RuneLen(r2):] continue Outer } else { @@ -120,23 +164,29 @@ Outer: // RankFind is similar to Find, except it will also rank all matches using // Levenshtein distance. func RankFind(source string, targets []string) Ranks { - var r Ranks - - for index, target := range targets { - if match(source, target, noop) { - distance := LevenshteinDistance(source, target) - r = append(r, Rank{source, target, distance, index}) - } - } - return r + return rankFind(source, targets, noopTransformer) } // RankFindFold is a case-insensitive version of RankFind. func RankFindFold(source string, targets []string) Ranks { + return rankFind(source, targets, foldTransformer) +} + +// RankFindNormalized is a unicode-normalized version of RankFind. +func RankFindNormalized(source string, targets []string) Ranks { + return rankFind(source, targets, normalizeTransformer) +} + +// RankFindNormalizedFold is a unicode-normalized and case-insensitive version of RankFind. 
+func RankFindNormalizedFold(source string, targets []string) Ranks { + return rankFind(source, targets, normalizeFoldTransformer) +} + +func rankFind(source string, targets []string, transformer transform.Transformer) Ranks { var r Ranks for index, target := range targets { - if match(source, target, unicode.ToLower) { + if match(source, target, transformer) { distance := LevenshteinDistance(source, target) r = append(r, Rank{source, target, distance, index}) } @@ -171,3 +221,33 @@ func (r Ranks) Swap(i, j int) { func (r Ranks) Less(i, j int) bool { return r[i].Distance < r[j].Distance } + +func stringTransform(s string, t transform.Transformer) (transformed string) { + var err error + transformed, _, err = transform.String(t, s) + if err != nil { + transformed = s + } + + return +} + +type unicodeFoldTransformer struct{} + +func (unicodeFoldTransformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + runes := bytes.Runes(src) + var lowerRunes []rune + for _, r := range runes { + lowerRunes = append(lowerRunes, unicode.ToLower(r)) + } + + srcBytes := []byte(string(lowerRunes)) + n := copy(dst, srcBytes) + if n < len(srcBytes) { + err = transform.ErrShortDst + } + + return n, n, err +} + +func (unicodeFoldTransformer) Reset() {} diff --git a/fuzzy/fuzzy_test.go b/fuzzy/fuzzy_test.go index c76545d..3a32330 100644 --- a/fuzzy/fuzzy_test.go +++ b/fuzzy/fuzzy_test.go @@ -45,6 +45,7 @@ var fuzzyTests = []struct { {"中国", "中华人民共和国", true, 5}, {"日本", "中华人民共和国", false, -1}, {"イ", "イカ", true, 1}, + {"limón", "limon", false, -1}, } func TestFuzzyMatch(t *testing.T) { @@ -67,11 +68,87 @@ func TestFuzzyMatchFold(t *testing.T) { } } +func TestFuzzyMatchNormalized(t *testing.T) { + var normalizedTests = []struct { + source string + target string + wanted bool + }{ + {"limon", "limón", true}, + {"limón", "limon tart", true}, + {"limón", "LiMóN tArT", false}, + {"limón", "LeMoN tArT", false}, + } + + for _, val := range normalizedTests { + match := 
MatchNormalized(val.source, val.target) + if match != val.wanted { + t.Errorf("%s in %s expected match to be %t, got %t", + val.source, val.target, val.wanted, match) + } + } +} + +func TestFuzzyMatchNormalizedFold(t *testing.T) { + var normalizedTests = []struct { + source string + target string + wanted bool + }{ + {"limon", "limón", true}, + {"limón", "limon tart", true}, + {"limón", "LiMóN tArT", true}, + {"limón", "LeMoN tArT", false}, + } + + for _, val := range normalizedTests { + match := MatchNormalizedFold(val.source, val.target) + if match != val.wanted { + t.Errorf("%s in %s expected match to be %t, got %t", + val.source, val.target, val.wanted, match) + } + } +} + func TestFuzzyFind(t *testing.T) { - target := []string{"cartwheel", "foobar", "wheel", "baz"} + target := []string{"cartwheel", "foobar", "wheel", "baz", "cartwhéél"} wanted := []string{"cartwheel", "wheel"} - matches := Find("whl", target) + matches := Find("whel", target) + + if len(matches) != len(wanted) { + t.Errorf("expected %s, got %s", wanted, matches) + } + + for i := range wanted { + if wanted[i] != matches[i] { + t.Errorf("expected %s, got %s", wanted, matches) + } + } +} + +func TestFuzzyFindNormalized(t *testing.T) { + target := []string{"cartwheel", "foobar", "wheel", "baz", "cartwhéél", "WHEEL"} + wanted := []string{"cartwheel", "wheel", "cartwhéél"} + + matches := FindNormalized("whél", target) + + if len(matches) != len(wanted) { + t.Errorf("expected %s, got %s", wanted, matches) + } + + for i := range wanted { + if wanted[i] != matches[i] { + t.Errorf("expected %s, got %s", wanted, matches) + } + } +} + +func TestFuzzyFindNormalizedFold(t *testing.T) { + target := []string{"cartwheel", "foobar", "wheel", "baz", "cartwhéél", "WHEEL"} + wanted := []string{"cartwheel", "wheel", "cartwhéél", "WHEEL"} + + matches := FindNormalizedFold("whél", target) if len(matches) != len(wanted) { t.Errorf("expected %s, got %s", wanted, matches) @@ -94,6 +171,47 @@ func TestRankMatch(t 
*testing.T) { } } +func TestRankMatchNormalized(t *testing.T) { + var fuzzyTests = []struct { + source string + target string + rank int + }{ + {"limó", "limon", 1}, + {"limó", "limon", 1}, + {"limó", "LIMON", -1}, + } + + for _, val := range fuzzyTests { + rank := RankMatchNormalized(val.source, val.target) + if rank != val.rank { + t.Errorf("expected ranking %d, got %d for %s in %s", + val.rank, rank, val.source, val.target) + } + } +} + +func TestRankMatchNormalizedFold(t *testing.T) { + var fuzzyTests = []struct { + source string + target string + rank int + }{ + {"limó", "limon", 1}, + {"limó", "limon", 1}, + {"limó", "LIMON", 1}, + {"limó", "LIMON TART", 6}, + } + + for _, val := range fuzzyTests { + rank := RankMatchNormalizedFold(val.source, val.target) + if rank != val.rank { + t.Errorf("expected ranking %d, got %d for %s in %s", + val.rank, rank, val.source, val.target) + } + } +} + func TestRankFind(t *testing.T) { target := []string{"cartwheel", "foobar", "wheel", "baz"} wanted := []Rank{ @@ -114,6 +232,47 @@ func TestRankFind(t *testing.T) { } } +func TestRankFindNormalized(t *testing.T) { + target := []string{"limón", "limon", "lemon", "LIMON"} + wanted := []Rank{ + {"limó", "limón", 1, 0}, + {"limó", "limon", 2, 1}, + } + + ranks := RankFindNormalized("limó", target) + + if len(ranks) != len(wanted) { + t.Errorf("expected %+v, got %+v", wanted, ranks) + } + + for i := range wanted { + if wanted[i] != ranks[i] { + t.Errorf("expected %+v, got %+v", wanted, ranks) + } + } +} + +func TestRankFindNormalizedFold(t *testing.T) { + target := []string{"limón", "limon", "lemon", "LIMON"} + wanted := []Rank{ + {"limó", "limón", 1, 0}, + {"limó", "limon", 2, 1}, + {"limó", "LIMON", 5, 3}, + } + + ranks := RankFindNormalizedFold("limó", target) + + if len(ranks) != len(wanted) { + t.Errorf("expected %+v, got %+v", wanted, ranks) + } + + for i := range wanted { + if wanted[i] != ranks[i] { + t.Errorf("expected %+v, got %+v", wanted, ranks) + } + } +} + func 
TestSortingRanks(t *testing.T) { rs := Ranks{{"a", "b", 1, 0}, {"a", "cc", 2, 1}, {"a", "a", 0, 2}} wanted := Ranks{rs[2], rs[0], rs[1]} diff --git a/go.mod b/go.mod index 51a8c9a..b036d7a 100644 --- a/go.mod +++ b/go.mod @@ -1 +1,3 @@ module github.com/lithammer/fuzzysearch + +require golang.org/x/text v0.3.2 From af4758a10c10e407b71578da56a66723adf6c9a2 Mon Sep 17 00:00:00 2001 From: Adriano Caloiaro Date: Mon, 9 Dec 2019 08:59:45 -0500 Subject: [PATCH 2/2] Run test suite on pull requests --- .github/workflows/go.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 5d75d1b..9cba815 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -1,5 +1,5 @@ name: Go -on: [push] +on: [push, pull_request] jobs: test: strategy: