-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathStringTools.ecl
200 lines (161 loc) · 7.62 KB
/
StringTools.ecl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
IMPORT std;
EXPORT StringTools := MODULE
// Layout of one substitution rule consumed by regexLoop: Regex is passed to REGEXREPLACE as the pattern, repl as the replacement text.
EXPORT regexLoopRec := {STRING Regex; STRING repl};
EXPORT regexLoop(STRING inStr, DATASET(regexLoopRec) regexDS, BOOLEAN noCaseRegex = TRUE, BOOLEAN TidyToo = TRUE) := FUNCTION
  /**
    * Applies a list of regex substitutions to a string, one after another,
    * each substitution working on the output of the previous one.
    *
    * Note matching is case-insensitive (NOCASE) by default.
    *
    * @param inStr        text to be transformed
    * @param regexDS      dataset of regexLoopRec rows (pattern + replacement), applied in record order;
    *                     the replacement is handed to REGEXREPLACE unchanged, so it may reference capture groups
    * @param noCaseRegex  when TRUE (default) every substitution is run with NOCASE
    * @param TidyToo      when TRUE (default) inStr is lower-cased and trimmed left and right before the first substitution
    *
    * @return String with all substitutions applied in order; the (optionally tidied) input when regexDS is empty
    */
  LOCAL aString := IF(TidyToo, TRIM(std.Str.ToLowerCase(inStr), LEFT, RIGHT), inStr);

  // ROLLUP only invokes its TRANSFORM on pairs of rows, so a placeholder row is
  // put in front of the rules to act as the very first LEFT and carry the start text.
  LOCAL regexDSBlankRow := DATASET([{' ',' '}], regexLoopRec);

  // '&' rather than '+': the placeholder must stay first and the rules must keep
  // their sequence; '+' appends without maintaining record order.
  LOCAL regexDSconcat := regexDSBlankRow & regexDS;

  // Every row starts out carrying the initial text; only the running LEFT value is actually read.
  LOCAL inDSaddCol := PROJECT(regexDSconcat,
                              TRANSFORM({RECORDOF(LEFT); STRING outString;},
                                        SELF.outString := aString;
                                        SELF := LEFT;));

  // Fold the whole dataset into one row: LEFT holds the text so far, RIGHT the next rule.
  LOCAL outDS := ROLLUP(inDSaddCol, TRUE,
                        TRANSFORM(RECORDOF(LEFT),
                                  SELF.outString := IF(noCaseRegex,
                                                       REGEXREPLACE(RIGHT.regex, LEFT.outString, RIGHT.repl, NOCASE),
                                                       REGEXREPLACE(RIGHT.regex, LEFT.outString, RIGHT.repl));
                                  SELF := RIGHT;));

  RETURN outDS[1].outString;
END;
EXPORT LongestWord (STRING InWords, STRING seperator = ' ') := FUNCTION
  /**
    * Takes a multi word string and returns just the longest word.
    * When several words share the greatest length, the one that sorts first wins.
    *
    * @param InWords    collection of words
    * @param seperator  word separator, defaults to space
    *
    * @return String of the longest word; InWords unchanged when it does not contain the separator
    */
  WordDS := DATASET(STD.Str.SplitWords(InWords, seperator), {STRING words}); // one row per word

  WithLen := PROJECT(WordDS,
                     TRANSFORM({INTEGER Len; STRING words;},
                               SELF.Len := LENGTH(LEFT.words);
                               SELF := LEFT;));

  // Longest first; ties broken by the word itself so the result is deterministic.
  Ranked := SORT(WithLen, -Len, words);

  // Test for the separator that was actually used for splitting (previously a
  // hard-coded space, which made every non-space separator return the whole input).
  HasSeparator := STD.Str.Find(InWords, seperator, 1) > 0;

  RETURN IF(HasSeparator, Ranked[1].words, InWords);
END;
EXPORT NumberSpacing (STRING InWords) := FUNCTION
  /**
    * Rewrites text into a regex pattern that tolerates optional spaces around digits.
    * Hyphens (with any single space either side) are likewise turned into an optional space.
    *
    * @param InWords  Text to be modified
    *
    * @return the text with '[ ]?' inserted around every digit and in place of hyphens
    */
  // Step 1: wrap each digit in optional-space tokens.
  digitsPadded := REGEXREPLACE('([0-9])', InWords, '[ ]?$1[ ]?');

  // Step 2: a hyphen, plus one space on either side if present, becomes an optional space.
  hyphensLoosened := REGEXREPLACE('[ ]?-[ ]?', digitsPadded, '[ ]?');

  // Step 3: a literal space directly before an optional-space token is absorbed into it ...
  spaceBeforeMerged := REGEXREPLACE(' \\[ \\]\\?', hyphensLoosened, '[ ]?');

  // Step 4: ... and so is a literal space directly after one.
  RETURN REGEXREPLACE('\\[ \\]\\? ', spaceBeforeMerged, '[ ]?');
END;
EXPORT ShortestWordDistance (STRING inString1, STRING inString2) := FUNCTION
  /**
    * Does a pairwise comparison of all space-separated words in each string
    * and returns the smallest edit distance found between any pair.
    *
    * @param inString1  Text to be compared 1
    * @param inString2  Text to be compared 2
    *
    * @return INTEGER edit distance of the closest pair of words (0 means an identical word
    *         occurs in both). NOTE: when either input yields no words there are no pairs and
    *         the out-of-range row read below is expected to give 0 as well - confirm before
    *         relying on 0 to mean "match".
    */
  wordRec := {STRING words};
  pairRec := {STRING words1; STRING words2; INTEGER distance;};

  split1 := DATASET(STD.Str.SplitWords(inString1, ' '), wordRec);
  split2 := DATASET(STD.Str.SplitWords(inString2, ' '), wordRec);

  // Every word of string 1 against every word of string 2 (cross join).
  AllComparisons := JOIN(split1, split2,
                         TRUE,
                         TRANSFORM(pairRec,
                                   SELF.distance := STD.Str.EditDistance(LEFT.words, RIGHT.words);
                                   SELF.words1   := LEFT.words;
                                   SELF.words2   := RIGHT.words;),
                         ALL);

  // Only the single closest pair is needed, so avoid sorting the whole set.
  Closest := TOPN(AllComparisons, 1, distance);

  RETURN (INTEGER) Closest[1].distance;
END;
EXPORT allWordsPresentRegex (STRING aStr, STRING sep = ' ') := FUNCTION
  /**
    * Builds a regex made of one lookahead per word of the input, so that a
    * candidate only matches when every word is present, in any order.
    *
    * @param aStr  Text to be converted
    * @param sep   word separator, defaults to ' '; it is passed to REGEXREPLACE as the search pattern
    *
    * @return Regex that will find all words in a string in any order
    */
  // Each separator closes the current lookahead and opens the next one.
  chainedLookaheads := REGEXREPLACE(sep, aStr, '\\\\b)(?=.*\\\\b');

  // Open the first lookahead, close the last, then consume the whole line.
  RETURN '^(?=.*\\b' + chainedLookaheads + '\\b).*$';
END;
EXPORT makeBOW(STRING aStr, STRING sep = ' ') := FUNCTION
  /**
    * Generates a unique, alphabetised word list ("bag of words") from a string.
    * The text is lower-cased and every character other than 0-9 / a-z is treated
    * as a word break.
    *
    * @param aStr  Text to be converted
    * @param sep   word separator, defaults to ' '
    *
    * @return the distinct words present, sorted ascending and joined by single spaces
    */
  lower    := std.str.tolowercase(aStr);
  lowerSep := std.str.tolowercase(sep);

  // Turn the caller's separator into a space BEFORE punctuation is stripped.
  // Previously the split used sep after stripping, by which point a punctuation
  // separator had already been replaced and could never match.
  // A blank separator needs no conversion (and is kept away from FindReplace).
  sepAsSpace := IF(LENGTH(TRIM(lowerSep)) = 0,
                   lower,
                   STD.Str.FindReplace(lower, lowerSep, ' '));

  noPunct := REGEXREPLACE('[^0-9a-z]', sepAsSpace, ' ');
  oneSpac := REGEXREPLACE('\\s+', noPunct, ' ');

  splitsDS     := DATASET(STD.Str.SplitWords(oneSpac, ' '), {STRING words});
  uniqueSplits := DEDUP(SORT(splitsDS, words), words);

  // Join with the standard library instead of an ITERATE running-concatenation.
  RETURN STD.Str.CombineWords(SET(uniqueSplits, words), ' ');
END;
EXPORT regexLoopOld(inStr, regex, replacement) := FUNCTIONMACRO
/**
* DEPRECATION WARNING: use the newer regexLoop defined at the top of this module instead.
*
* Walks two parallel sets and applies one regex substitution per position,
* feeding the result of each substitution into the next.
* Every substitution is run with NOCASE.
*
* @param inStr a string to correct
* @param regex a set containing the regex patterns to substitute
* @param replacement a set of replacement texts, indexed in step with regex
*
* @return string with all regexs applied in order
*
* NOTE(review): nothing here checks that regex and replacement have the same number of
* elements, and the loop body runs once before the index is compared with the pattern
* count - confirm the behaviour for an empty regex set.
*/
IMPORT std;
// aString := IF(TidyToo, std.Str.ToLowerCase(inStr), inStr);
// Template symbols: regexI / replaceI hold the current pattern and replacement,
// outStr the text produced so far, I the 1-based position, Nmax the number of patterns.
#DECLARE(regexI);
#DECLARE(replaceI);
#DECLARE(outStr); #SET(outStr, inStr);
#DECLARE(I); #SET(I, 1);
#DECLARE(Nmax); #SET(nmax, COUNT(regex));
//need a warning for different sized sets
#LOOP
// Pick the pattern and replacement at the current position.
#SET(regexI, regex[%I%]);
#SET(replaceI, replacement[%I%]);
// Apply this substitution to the text accumulated so far.
#SET(outStr, REGEXREPLACE(%'regexI'%, %'outStr'%, %'replaceI'%, NOCASE));
// OUTPUT(%'outStr'%);
#SET(I, 1 + %I%);
// Stop once the position has moved past the last pattern.
#IF(%I% > %Nmax%); #BREAK #END;
#END
// FinalOutStr := IF(TidyToo, TRIM(%'outStr'%, LEFT, RIGHT), %'outStr'%);
// The macro's result is the accumulated text, emitted as a quoted string.
RETURN %'outStr'%;
ENDMACRO;
END;