From 2823642c1233f68fda558769d7f9f40ed3eeaf5c Mon Sep 17 00:00:00 2001
From: Adriano Caloiaro <adriano@caloiaro.com>
Date: Sat, 7 Dec 2019 17:56:41 -0500
Subject: [PATCH 1/2] Add normalized and normalized-fold varieties of
 match/find/rank

---
 fuzzy/fuzzy.go      | 126 +++++++++++++++++++++++++++-------
 fuzzy/fuzzy_test.go | 163 +++++++++++++++++++++++++++++++++++++++++++-
 go.mod              |   2 +
 3 files changed, 266 insertions(+), 25 deletions(-)

diff --git a/fuzzy/fuzzy.go b/fuzzy/fuzzy.go
index 33d4898..1751372 100644
--- a/fuzzy/fuzzy.go
+++ b/fuzzy/fuzzy.go
@@ -3,11 +3,19 @@
 package fuzzy
 
 import (
+	"bytes"
 	"unicode"
 	"unicode/utf8"
+
+	"golang.org/x/text/runes"
+	"golang.org/x/text/transform"
+	"golang.org/x/text/unicode/norm"
 )
 
-var noop = func(r rune) rune { return r }
+var foldTransformer = unicodeFoldTransformer{}
+var noopTransformer = transform.Nop
+var normalizeTransformer = transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
+var normalizeFoldTransformer = transform.Chain(normalizeTransformer, foldTransformer)
 
 // Match returns true if source matches target using a fuzzy-searching
 // algorithm. Note that it doesn't implement Levenshtein distance (see
@@ -15,15 +23,28 @@ var noop = func(r rune) rune { return r }
 // approximation. The method will return true only if each character in the
 // source can be found in the target and occurs after the preceding matches.
 func Match(source, target string) bool {
-	return match(source, target, noop)
+	return match(source, target, noopTransformer)
 }
 
 // MatchFold is a case-insensitive version of Match.
 func MatchFold(source, target string) bool {
-	return match(source, target, unicode.ToLower)
+	return match(source, target, foldTransformer)
+}
+
+// MatchNormalized is a unicode-normalized version of Match.
+func MatchNormalized(source, target string) bool {
+	return match(source, target, normalizeTransformer)
 }
 
-func match(source, target string, fn func(rune) rune) bool {
+// MatchNormalizedFold is a unicode-normalized and case-insensitive version of Match.
+func MatchNormalizedFold(source, target string) bool {
+	return match(source, target, normalizeFoldTransformer)
+}
+
+func match(source, target string, transformer transform.Transformer) bool {
+	source = stringTransform(source, transformer)
+	target = stringTransform(target, transformer)
+
 	lenDiff := len(target) - len(source)
 
 	if lenDiff < 0 {
@@ -37,7 +58,7 @@ func match(source, target string, fn func(rune) rune) bool {
 Outer:
 	for _, r1 := range source {
 		for i, r2 := range target {
-			if fn(r1) == fn(r2) {
+			if r1 == r2 {
 				target = target[i+utf8.RuneLen(r2):]
 				continue Outer
 			}
@@ -50,19 +71,29 @@ Outer:
 
 // Find will return a list of strings in targets that fuzzy matches source.
 func Find(source string, targets []string) []string {
-	return find(source, targets, noop)
+	return find(source, targets, noopTransformer)
 }
 
 // FindFold is a case-insensitive version of Find.
 func FindFold(source string, targets []string) []string {
-	return find(source, targets, unicode.ToLower)
+	return find(source, targets, foldTransformer)
 }
 
-func find(source string, targets []string, fn func(rune) rune) []string {
+// FindNormalized is a unicode-normalized version of Find.
+func FindNormalized(source string, targets []string) []string {
+	return find(source, targets, normalizeTransformer)
+}
+
+// FindNormalizedFold is a unicode-normalized and case-insensitive version of Find.
+func FindNormalizedFold(source string, targets []string) []string {
+	return find(source, targets, normalizeFoldTransformer)
+}
+
+func find(source string, targets []string, transformer transform.Transformer) []string {
 	var matches []string
 
 	for _, target := range targets {
-		if match(source, target, fn) {
+		if match(source, target, transformer) {
 			matches = append(matches, target)
 		}
 	}
@@ -77,21 +108,34 @@ func find(source string, targets []string, fn func(rune) rune) []string {
 // the Levenshtein calculation, only deletions need be considered, required
 // additions and substitutions would fail the match test.
 func RankMatch(source, target string) int {
-	return rank(source, target, noop)
+	return rank(source, target, noopTransformer)
 }
 
 // RankMatchFold is a case-insensitive version of RankMatch.
 func RankMatchFold(source, target string) int {
-	return rank(source, target, unicode.ToLower)
+	return rank(source, target, foldTransformer)
 }
 
-func rank(source, target string, fn func(rune) rune) int {
+// RankMatchNormalized is a unicode-normalized version of RankMatch.
+func RankMatchNormalized(source, target string) int {
+	return rank(source, target, normalizeTransformer)
+}
+
+// RankMatchNormalizedFold is a unicode-normalized and case-insensitive version of RankMatch.
+func RankMatchNormalizedFold(source, target string) int {
+	return rank(source, target, normalizeFoldTransformer)
+}
+
+func rank(source, target string, transformer transform.Transformer) int {
+	// Transform first: it can change byte lengths, which the guard below compares.
+	source = stringTransform(source, transformer)
+	target = stringTransform(target, transformer)
 	lenDiff := len(target) - len(source)
 
 	if lenDiff < 0 {
 		return -1
 	}
 
 	if lenDiff == 0 && source == target {
 		return 0
 	}
@@ -101,7 +145,7 @@ func rank(source, target string, fn func(rune) rune) int {
 Outer:
 	for _, r1 := range source {
 		for i, r2 := range target {
-			if fn(r1) == fn(r2) {
+			if r1 == r2 {
 				target = target[i+utf8.RuneLen(r2):]
 				continue Outer
 			} else {
@@ -120,23 +164,29 @@ Outer:
 // RankFind is similar to Find, except it will also rank all matches using
 // Levenshtein distance.
 func RankFind(source string, targets []string) Ranks {
-	var r Ranks
-
-	for index, target := range targets {
-		if match(source, target, noop) {
-			distance := LevenshteinDistance(source, target)
-			r = append(r, Rank{source, target, distance, index})
-		}
-	}
-	return r
+	return rankFind(source, targets, noopTransformer)
 }
 
 // RankFindFold is a case-insensitive version of RankFind.
 func RankFindFold(source string, targets []string) Ranks {
+	return rankFind(source, targets, foldTransformer)
+}
+
+// RankFindNormalized is a unicode-normalized version of RankFind.
+func RankFindNormalized(source string, targets []string) Ranks {
+	return rankFind(source, targets, normalizeTransformer)
+}
+
+// RankFindNormalizedFold is a unicode-normalized and case-insensitive version of RankFind.
+func RankFindNormalizedFold(source string, targets []string) Ranks {
+	return rankFind(source, targets, normalizeFoldTransformer)
+}
+
+func rankFind(source string, targets []string, transformer transform.Transformer) Ranks {
 	var r Ranks
 
 	for index, target := range targets {
-		if match(source, target, unicode.ToLower) {
+		if match(source, target, transformer) {
 			distance := LevenshteinDistance(source, target)
 			r = append(r, Rank{source, target, distance, index})
 		}
@@ -171,3 +221,52 @@ func (r Ranks) Swap(i, j int) {
 func (r Ranks) Less(i, j int) bool {
 	return r[i].Distance < r[j].Distance
 }
+
+// stringTransform returns s transformed by t. If the transformation reports
+// an error, s is returned unchanged.
+func stringTransform(s string, t transform.Transformer) string {
+	transformed, _, err := transform.String(t, s)
+	if err != nil {
+		return s
+	}
+
+	return transformed
+}
+
+// unicodeFoldTransformer is a transform.Transformer that lowercases its input
+// with unicode.ToLower semantics.
+type unicodeFoldTransformer struct{}
+
+// Transform lowercases src into dst and reports how many bytes were written
+// (nDst) and how many were consumed (nSrc). The two counts differ whenever
+// the lowercase form of a rune has a different encoded length, so they are
+// reported separately.
+//
+// Unless atEOF is set, a rune cut short at the end of src is left unconsumed
+// and ErrShortSrc is returned so the caller can supply its remaining bytes.
+// If dst cannot hold the whole result, nothing is written or consumed and
+// ErrShortDst is returned so the caller can retry with a larger buffer.
+func (unicodeFoldTransformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
+	nSrc = len(src)
+	// An incomplete rune is at most utf8.UTFMax-1 bytes long, so only the
+	// last few bytes need to be inspected for a dangling lead byte.
+	for i := 1; !atEOF && i < utf8.UTFMax && i <= len(src); i++ {
+		if tail := src[len(src)-i:]; utf8.RuneStart(tail[0]) {
+			if !utf8.FullRune(tail) {
+				nSrc, err = len(src)-i, transform.ErrShortSrc
+			}
+			break
+		}
+	}
+
+	// Invalid bytes come out as utf8.RuneError, as with bytes.Runes before.
+	lowered := bytes.ToLower(src[:nSrc])
+	if len(lowered) > len(dst) {
+		return 0, 0, transform.ErrShortDst
+	}
+
+	return copy(dst, lowered), nSrc, err
+}
+
+// Reset is a no-op: the transformer keeps no state between calls.
+func (unicodeFoldTransformer) Reset() {}
diff --git a/fuzzy/fuzzy_test.go b/fuzzy/fuzzy_test.go
index c76545d..3a32330 100644
--- a/fuzzy/fuzzy_test.go
+++ b/fuzzy/fuzzy_test.go
@@ -45,6 +45,7 @@ var fuzzyTests = []struct {
 	{"中国", "中华人民共和国", true, 5},
 	{"日本", "中华人民共和国", false, -1},
 	{"イ", "イカ", true, 1},
+	{"limón", "limon", false, -1},
 }
 
 func TestFuzzyMatch(t *testing.T) {
@@ -67,11 +68,87 @@ func TestFuzzyMatchFold(t *testing.T) {
 	}
 }
 
+func TestFuzzyMatchNormalized(t *testing.T) {
+	var normalizedTests = []struct {
+		source string
+		target string
+		wanted bool
+	}{
+		{"limon", "limón", true},
+		{"limón", "limon tart", true},
+		{"limón", "LiMóN tArT", false},
+		{"limón", "LeMoN tArT", false},
+	}
+
+	for _, val := range normalizedTests {
+		match := MatchNormalized(val.source, val.target)
+		if match != val.wanted {
+			t.Errorf("%s in %s expected match to be %t, got %t",
+				val.source, val.target, val.wanted, match)
+		}
+	}
+}
+
+func TestFuzzyMatchNormalizedFold(t *testing.T) {
+	var normalizedTests = []struct {
+		source string
+		target string
+		wanted bool
+	}{
+		{"limon", "limón", true},
+		{"limón", "limon tart", true},
+		{"limón", "LiMóN tArT", true},
+		{"limón", "LeMoN tArT", false},
+	}
+
+	for _, val := range normalizedTests {
+		match := MatchNormalizedFold(val.source, val.target)
+		if match != val.wanted {
+			t.Errorf("%s in %s expected match to be %t, got %t",
+				val.source, val.target, val.wanted, match)
+		}
+	}
+}
+
 func TestFuzzyFind(t *testing.T) {
-	target := []string{"cartwheel", "foobar", "wheel", "baz"}
+	target := []string{"cartwheel", "foobar", "wheel", "baz", "cartwhéél"}
 	wanted := []string{"cartwheel", "wheel"}
 
-	matches := Find("whl", target)
+	matches := Find("whel", target)
+
+	if len(matches) != len(wanted) {
+		t.Errorf("expected %s, got %s", wanted, matches)
+	}
+
+	for i := range wanted {
+		if wanted[i] != matches[i] {
+			t.Errorf("expected %s, got %s", wanted, matches)
+		}
+	}
+}
+
+func TestFuzzyFindNormalized(t *testing.T) {
+	target := []string{"cartwheel", "foobar", "wheel", "baz", "cartwhéél", "WHEEL"}
+	wanted := []string{"cartwheel", "wheel", "cartwhéél"}
+
+	matches := FindNormalized("whél", target)
+
+	if len(matches) != len(wanted) {
+		t.Errorf("expected %s, got %s", wanted, matches)
+	}
+
+	for i := range wanted {
+		if wanted[i] != matches[i] {
+			t.Errorf("expected %s, got %s", wanted, matches)
+		}
+	}
+}
+
+func TestFuzzyFindNormalizedFold(t *testing.T) {
+	target := []string{"cartwheel", "foobar", "wheel", "baz", "cartwhéél", "WHEEL"}
+	wanted := []string{"cartwheel", "wheel", "cartwhéél", "WHEEL"}
+
+	matches := FindNormalizedFold("whél", target)
 
 	if len(matches) != len(wanted) {
 		t.Errorf("expected %s, got %s", wanted, matches)
@@ -94,6 +171,47 @@ func TestRankMatch(t *testing.T) {
 	}
 }
 
+func TestRankMatchNormalized(t *testing.T) {
+	var fuzzyTests = []struct {
+		source string
+		target string
+		rank   int
+	}{
+		{"limó", "limon", 1},
+		{"limó", "limon", 1},
+		{"limó", "LIMON", -1},
+	}
+
+	for _, val := range fuzzyTests {
+		rank := RankMatchNormalized(val.source, val.target)
+		if rank != val.rank {
+			t.Errorf("expected ranking %d, got %d for %s in %s",
+				val.rank, rank, val.source, val.target)
+		}
+	}
+}
+
+func TestRankMatchNormalizedFold(t *testing.T) {
+	var fuzzyTests = []struct {
+		source string
+		target string
+		rank   int
+	}{
+		{"limó", "limon", 1},
+		{"limó", "limon", 1},
+		{"limó", "LIMON", 1},
+		{"limó", "LIMON TART", 6},
+	}
+
+	for _, val := range fuzzyTests {
+		rank := RankMatchNormalizedFold(val.source, val.target)
+		if rank != val.rank {
+			t.Errorf("expected ranking %d, got %d for %s in %s",
+				val.rank, rank, val.source, val.target)
+		}
+	}
+}
+
 func TestRankFind(t *testing.T) {
 	target := []string{"cartwheel", "foobar", "wheel", "baz"}
 	wanted := []Rank{
@@ -114,6 +232,47 @@ func TestRankFind(t *testing.T) {
 	}
 }
 
+func TestRankFindNormalized(t *testing.T) {
+	target := []string{"limón", "limon", "lemon", "LIMON"}
+	wanted := []Rank{
+		{"limó", "limón", 1, 0},
+		{"limó", "limon", 2, 1},
+	}
+
+	ranks := RankFindNormalized("limó", target)
+
+	if len(ranks) != len(wanted) {
+		t.Errorf("expected %+v, got %+v", wanted, ranks)
+	}
+
+	for i := range wanted {
+		if wanted[i] != ranks[i] {
+			t.Errorf("expected %+v, got %+v", wanted, ranks)
+		}
+	}
+}
+
+func TestRankFindNormalizedFold(t *testing.T) {
+	target := []string{"limón", "limon", "lemon", "LIMON"}
+	wanted := []Rank{
+		{"limó", "limón", 1, 0},
+		{"limó", "limon", 2, 1},
+		{"limó", "LIMON", 5, 3},
+	}
+
+	ranks := RankFindNormalizedFold("limó", target)
+
+	if len(ranks) != len(wanted) {
+		t.Errorf("expected %+v, got %+v", wanted, ranks)
+	}
+
+	for i := range wanted {
+		if wanted[i] != ranks[i] {
+			t.Errorf("expected %+v, got %+v", wanted, ranks)
+		}
+	}
+}
+
 func TestSortingRanks(t *testing.T) {
 	rs := Ranks{{"a", "b", 1, 0}, {"a", "cc", 2, 1}, {"a", "a", 0, 2}}
 	wanted := Ranks{rs[2], rs[0], rs[1]}
diff --git a/go.mod b/go.mod
index 51a8c9a..b036d7a 100644
--- a/go.mod
+++ b/go.mod
@@ -1 +1,3 @@
 module github.com/lithammer/fuzzysearch
+
+require golang.org/x/text v0.3.2

From af4758a10c10e407b71578da56a66723adf6c9a2 Mon Sep 17 00:00:00 2001
From: Adriano Caloiaro <adriano@caloiaro.com>
Date: Mon, 9 Dec 2019 08:59:45 -0500
Subject: [PATCH 2/2] Run test suite on pull requests

---
 .github/workflows/go.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index 5d75d1b..9cba815 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -1,5 +1,5 @@
 name: Go
-on: [push]
+on: [push, pull_request]
 jobs:
   test:
     strategy: