Skip to content

Commit

Permalink
Merge pull request #16 from acaloiaro/add-unicode-normalization
Browse files Browse the repository at this point in the history
Add normalized and normalized-fold varieties of match/find/rank
  • Loading branch information
lithammer authored Dec 9, 2019
2 parents 9d9e791 + af4758a commit 3120a96
Show file tree
Hide file tree
Showing 4 changed files with 267 additions and 26 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/go.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: Go
on: [push]
on: [push, pull_request]
jobs:
test:
strategy:
Expand Down
126 changes: 103 additions & 23 deletions fuzzy/fuzzy.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,48 @@
package fuzzy

import (
"bytes"
"unicode"
"unicode/utf8"

"golang.org/x/text/runes"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
)

// noop is the identity rune mapping: it returns its argument unchanged.
var noop = func(r rune) rune { return r }
// foldTransformer lower-cases its input rune by rune (see unicodeFoldTransformer).
var foldTransformer = unicodeFoldTransformer{}
// noopTransformer is transform.Nop, which passes its input through unmodified.
var noopTransformer = transform.Nop
// normalizeTransformer decomposes its input (NFD), removes every rune in the
// Unicode category Mn (nonspacing marks), and recomposes the result (NFC).
// NOTE(review): the transformer returned by transform.Chain appears to keep
// internal buffers, and this single instance is shared by every caller —
// confirm whether concurrent use of the *Normalized functions is safe.
var normalizeTransformer = transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
// normalizeFoldTransformer applies normalizeTransformer and then foldTransformer.
var normalizeFoldTransformer = transform.Chain(normalizeTransformer, foldTransformer)

// Match returns true if source matches target using a fuzzy-searching
// algorithm. Note that it doesn't implement Levenshtein distance (see
// RankMatch instead), but rather a simplified version where there's no
// approximation. The method will return true only if each character in the
// source can be found in the target and occurs after the preceding matches.
func Match(source, target string) bool {
return match(source, target, noop)
return match(source, target, noopTransformer)
}

// MatchFold is a case-insensitive version of Match.
func MatchFold(source, target string) bool {
return match(source, target, unicode.ToLower)
return match(source, target, foldTransformer)
}

// MatchNormalized is a Unicode-normalized version of Match. Both source and
// target are passed through normalizeTransformer (NFD decomposition, removal
// of nonspacing marks, NFC recomposition) before they are compared, so that,
// for example, "é" and "e" are treated as the same character.
func MatchNormalized(source, target string) bool {
return match(source, target, normalizeTransformer)
}

func match(source, target string, fn func(rune) rune) bool {
// MatchNormalizedFold is a Unicode-normalized and case-insensitive version of
// Match. Both source and target are passed through normalizeFoldTransformer,
// which strips nonspacing marks and then lower-cases every rune, before they
// are compared.
func MatchNormalizedFold(source, target string) bool {
return match(source, target, normalizeFoldTransformer)
}

func match(source, target string, transformer transform.Transformer) bool {
source = stringTransform(source, transformer)
target = stringTransform(target, transformer)

lenDiff := len(target) - len(source)

if lenDiff < 0 {
Expand All @@ -37,7 +58,7 @@ func match(source, target string, fn func(rune) rune) bool {
Outer:
for _, r1 := range source {
for i, r2 := range target {
if fn(r1) == fn(r2) {
if r1 == r2 {
target = target[i+utf8.RuneLen(r2):]
continue Outer
}
Expand All @@ -50,19 +71,29 @@ Outer:

// Find will return a list of strings in targets that fuzzy matches source.
func Find(source string, targets []string) []string {
return find(source, targets, noop)
return find(source, targets, noopTransformer)
}

// FindFold is a case-insensitive version of Find.
func FindFold(source string, targets []string) []string {
return find(source, targets, unicode.ToLower)
return find(source, targets, foldTransformer)
}

func find(source string, targets []string, fn func(rune) rune) []string {
// FindNormalized is a Unicode-normalized version of Find. Each target is
// tested against source using normalizeTransformer (nonspacing marks are
// stripped from both sides before comparison). The strings collected as
// matches are the original, untransformed targets.
func FindNormalized(source string, targets []string) []string {
return find(source, targets, normalizeTransformer)
}

// FindNormalizedFold is a Unicode-normalized and case-insensitive version of
// Find. Each target is tested against source using normalizeFoldTransformer
// (nonspacing marks stripped, then lower-cased). The strings collected as
// matches are the original, untransformed targets.
func FindNormalizedFold(source string, targets []string) []string {
return find(source, targets, normalizeFoldTransformer)
}

func find(source string, targets []string, transformer transform.Transformer) []string {
var matches []string

for _, target := range targets {
if match(source, target, fn) {
if match(source, target, transformer) {
matches = append(matches, target)
}
}
Expand All @@ -77,21 +108,34 @@ func find(source string, targets []string, fn func(rune) rune) []string {
// the Levenshtein calculation, only deletions need be considered, required
// additions and substitutions would fail the match test.
func RankMatch(source, target string) int {
return rank(source, target, noop)
return rank(source, target, noopTransformer)
}

// RankMatchFold is a case-insensitive version of RankMatch.
func RankMatchFold(source, target string) int {
return rank(source, target, unicode.ToLower)
return rank(source, target, foldTransformer)
}

func rank(source, target string, fn func(rune) rune) int {
// RankMatchNormalized is a Unicode-normalized version of RankMatch: source
// and target are passed through normalizeTransformer before being ranked.
// NOTE(review): rank returns -1 based on byte lengths measured before the
// transformer runs, so a decomposed source such as "e\u0301" is rejected
// against the target "e" even though both normalize to "e" — confirm whether
// this is intended.
func RankMatchNormalized(source, target string) int {
return rank(source, target, normalizeTransformer)
}

// RankMatchNormalizedFold is a Unicode-normalized and case-insensitive
// version of RankMatch: source and target are passed through
// normalizeFoldTransformer (nonspacing marks stripped, then lower-cased)
// before being ranked.
// NOTE(review): rank compares the byte lengths of the untransformed strings
// before applying the transformer — confirm that the early -1 result is
// intended for inputs whose length changes under normalization.
func RankMatchNormalizedFold(source, target string) int {
return rank(source, target, normalizeFoldTransformer)
}

func rank(source, target string, transformer transform.Transformer) int {
lenDiff := len(target) - len(source)

if lenDiff < 0 {
return -1
}

source = stringTransform(source, transformer)
target = stringTransform(target, transformer)

if lenDiff == 0 && source == target {
return 0
}
Expand All @@ -101,7 +145,7 @@ func rank(source, target string, fn func(rune) rune) int {
Outer:
for _, r1 := range source {
for i, r2 := range target {
if fn(r1) == fn(r2) {
if r1 == r2 {
target = target[i+utf8.RuneLen(r2):]
continue Outer
} else {
Expand All @@ -120,23 +164,29 @@ Outer:
// RankFind is similar to Find, except it will also rank all matches using
// Levenshtein distance.
func RankFind(source string, targets []string) Ranks {
var r Ranks

for index, target := range targets {
if match(source, target, noop) {
distance := LevenshteinDistance(source, target)
r = append(r, Rank{source, target, distance, index})
}
}
return r
return rankFind(source, targets, noopTransformer)
}

// RankFindFold is a case-insensitive version of RankFind. Targets are
// selected by matching against source with foldTransformer (both sides
// lower-cased). The Distance stored in each Rank is the LevenshteinDistance
// of source and target exactly as given, without lower-casing.
func RankFindFold(source string, targets []string) Ranks {
return rankFind(source, targets, foldTransformer)
}

// RankFindNormalized is a Unicode-normalized version of RankFind. Targets are
// selected by matching against source with normalizeTransformer (nonspacing
// marks stripped from both sides). The Distance stored in each Rank is the
// LevenshteinDistance of source and target exactly as given, without
// normalization.
func RankFindNormalized(source string, targets []string) Ranks {
return rankFind(source, targets, normalizeTransformer)
}

// RankFindNormalizedFold is a Unicode-normalized and case-insensitive version
// of RankFind. Targets are selected by matching against source with
// normalizeFoldTransformer (nonspacing marks stripped, then lower-cased). The
// Distance stored in each Rank is the LevenshteinDistance of source and
// target exactly as given, without normalization or lower-casing.
func RankFindNormalizedFold(source string, targets []string) Ranks {
return rankFind(source, targets, normalizeFoldTransformer)
}

func rankFind(source string, targets []string, transformer transform.Transformer) Ranks {
var r Ranks

for index, target := range targets {
if match(source, target, unicode.ToLower) {
if match(source, target, transformer) {
distance := LevenshteinDistance(source, target)
r = append(r, Rank{source, target, distance, index})
}
Expand Down Expand Up @@ -171,3 +221,33 @@ func (r Ranks) Swap(i, j int) {
// Less reports whether the rank at index i has a smaller Distance than the
// rank at index j.
func (r Ranks) Less(i, j int) bool {
return r[i].Distance < r[j].Distance
}

// stringTransform runs s through t and returns the transformed text. If the
// transformation reports an error, s is returned unchanged instead.
func stringTransform(s string, t transform.Transformer) (transformed string) {
	result, _, err := transform.String(t, s)
	if err == nil {
		return result
	}

	// Best effort: fall back to the untransformed input on failure.
	return s
}

// unicodeFoldTransformer is a transform.Transformer that lower-cases its
// input rune by rune with unicode.ToLower. It holds no state.
type unicodeFoldTransformer struct{}

// Transform writes the lower-cased form of src into dst.
//
// It returns the number of bytes written to dst (nDst) and the number of
// bytes of src that were consumed (nSrc). The two differ whenever a rune's
// lower-case form has a different encoded length (for example U+212A KELVIN
// SIGN is 3 bytes and lowers to the 1-byte 'k').
//
// Only whole runes are ever written: if the next lower-cased rune does not
// fit in the remaining space of dst, transform.ErrShortDst is returned and
// that rune is left unconsumed. If src ends in an incomplete UTF-8 sequence
// and atEOF is false, transform.ErrShortSrc is returned so the caller can
// supply the rest of the rune. Invalid bytes (and, when atEOF is true, a
// truncated trailing sequence) are each emitted as U+FFFD.
func (unicodeFoldTransformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	for _, r := range bytes.Runes(src) {
		rest := src[nSrc:]

		// More input may still complete this rune; do not turn it into U+FFFD yet.
		if !atEOF && !utf8.FullRune(rest) {
			return nDst, nSrc, transform.ErrShortSrc
		}

		// bytes.Runes reports every invalid byte as U+FFFD, so the rune value
		// alone does not reveal how many source bytes it came from; measure
		// the width on src directly.
		_, width := utf8.DecodeRune(rest)

		lower := unicode.ToLower(r)
		if utf8.RuneLen(lower) > len(dst)-nDst {
			return nDst, nSrc, transform.ErrShortDst
		}

		nDst += utf8.EncodeRune(dst[nDst:], lower)
		nSrc += width
	}

	return nDst, nSrc, nil
}

// Reset is a no-op because the transformer keeps no state between calls.
func (unicodeFoldTransformer) Reset() {}
Loading

0 comments on commit 3120a96

Please sign in to comment.