Skip to content

Commit

Permalink
Add Prefix, Suffix and Ngram UDFs (#12392)
Browse files Browse the repository at this point in the history
  • Loading branch information
deemoliu authored Apr 23, 2024
1 parent 0caeccf commit 36c4b9a
Show file tree
Hide file tree
Showing 2 changed files with 158 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
*/
package org.apache.pinot.common.function.scalar;

import it.unimi.dsi.fastutil.objects.ObjectLinkedOpenHashSet;
import it.unimi.dsi.fastutil.objects.ObjectSet;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
Expand All @@ -28,6 +30,7 @@
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
import org.apache.commons.lang3.StringUtils;
import org.apache.pinot.common.utils.RegexpPatternConverterUtils;
import org.apache.pinot.spi.annotations.ScalarFunction;
Expand Down Expand Up @@ -580,6 +583,111 @@ public static String[] split(String input, String delimiter, int limit) {
return StringUtils.splitByWholeSeparator(input, delimiter, limit);
}

/**
 * Generates the leading substrings (prefixes) of the input, from length 1 up to {@code maxlength} inclusive,
 * capped at the length of the input.
 *
 * @param input the string to generate prefixes for.
 * @param maxlength the maximum length (inclusive) of the generated prefixes; zero or a negative value yields an
 *                  empty array.
 * @return the prefixes ordered from shortest to longest, e.g. {@code prefixes("abcde", 3)} returns
 *         {@code ["a", "ab", "abc"]}.
 */
@ScalarFunction
public static String[] prefixes(String input, int maxlength) {
  // Clamp at zero: a negative maxlength would otherwise produce a negative array size and throw.
  int arrLength = Math.max(0, Math.min(maxlength, input.length()));
  String[] prefixArr = new String[arrLength];
  for (int prefixIdx = 1; prefixIdx <= arrLength; prefixIdx++) {
    prefixArr[prefixIdx - 1] = input.substring(0, prefixIdx);
  }
  return prefixArr;
}

/**
 * Generates the leading substrings (prefixes) of the input, from length 1 up to {@code maxlength} inclusive
 * (capped at the length of the input), each prepended with the given {@code prefix} string.
 *
 * @param input the string to generate prefixes for.
 * @param maxlength the maximum length (inclusive) of the generated prefixes, not counting the prepended string;
 *                  zero or a negative value yields an empty array.
 * @param prefix the string to prepend to every generated prefix, e.g. '^' for regex matching. When
 *               {@code null}, the plain prefixes are returned.
 * @return the decorated prefixes ordered from shortest to longest, e.g.
 *         {@code prefixesWithPrefix("abcde", 3, "^")} returns {@code ["^a", "^ab", "^abc"]}.
 */
@ScalarFunction(nullableParameters = true, names = {"prefixesWithPrefix", "prefixes_with_prefix"})
public static String[] prefixesWithPrefix(String input, int maxlength, @Nullable String prefix) {
  // NOTE(review): nullableParameters = true presumably lets a null input reach this method, which would throw a
  // NullPointerException below - confirm the intended result for that case.
  // Clamp at zero: a negative maxlength would otherwise produce a negative array size and throw.
  int arrLength = Math.max(0, Math.min(maxlength, input.length()));
  String[] prefixArr = new String[arrLength];
  for (int prefixIdx = 1; prefixIdx <= arrLength; prefixIdx++) {
    String head = input.substring(0, prefixIdx);
    // A null prefix means "no decoration", which matches the output of the plain prefixes function.
    prefixArr[prefixIdx - 1] = prefix == null ? head : prefix + head;
  }
  return prefixArr;
}

/**
 * Generates the trailing substrings (suffixes) of the input, from length 1 up to {@code maxlength} inclusive,
 * capped at the length of the input.
 *
 * @param input the string to generate suffixes for.
 * @param maxlength the maximum length (inclusive) of the generated suffixes; zero or a negative value yields an
 *                  empty array.
 * @return the suffixes ordered from shortest to longest, e.g. {@code suffixes("abcde", 3)} returns
 *         {@code ["e", "de", "cde"]}.
 */
@ScalarFunction
public static String[] suffixes(String input, int maxlength) {
  int inputLength = input.length();
  // Clamp at zero: a negative maxlength would otherwise produce a negative array size and throw.
  int arrLength = Math.max(0, Math.min(maxlength, inputLength));
  String[] suffixArr = new String[arrLength];
  for (int suffixIdx = 1; suffixIdx <= arrLength; suffixIdx++) {
    suffixArr[suffixIdx - 1] = input.substring(inputLength - suffixIdx);
  }
  return suffixArr;
}

/**
 * Generates the trailing substrings (suffixes) of the input, from length 1 up to {@code maxlength} inclusive
 * (capped at the length of the input), each followed by the given {@code suffix} string.
 *
 * @param input the string to generate suffixes for.
 * @param maxlength the maximum length (inclusive) of the generated suffixes, not counting the appended string;
 *                  zero or a negative value yields an empty array.
 * @param suffix the string to append to every generated suffix, e.g. '$' for regex matching. When
 *               {@code null}, the plain suffixes are returned.
 * @return the decorated suffixes ordered from shortest to longest, e.g.
 *         {@code suffixesWithSuffix("abcde", 3, "$")} returns {@code ["e$", "de$", "cde$"]}.
 */
@ScalarFunction(nullableParameters = true, names = {"suffixesWithSuffix", "suffixes_with_suffix"})
public static String[] suffixesWithSuffix(String input, int maxlength, @Nullable String suffix) {
  // NOTE(review): nullableParameters = true presumably lets a null input reach this method, which would throw a
  // NullPointerException below - confirm the intended result for that case.
  int inputLength = input.length();
  // Clamp at zero: a negative maxlength would otherwise produce a negative array size and throw.
  int arrLength = Math.max(0, Math.min(maxlength, inputLength));
  String[] suffixArr = new String[arrLength];
  for (int suffixIdx = 1; suffixIdx <= arrLength; suffixIdx++) {
    String tail = input.substring(inputLength - suffixIdx);
    // A null suffix means "no decoration", which matches the output of the plain suffixes function.
    suffixArr[suffixIdx - 1] = suffix == null ? tail : tail + suffix;
  }
  return suffixArr;
}

/**
 * Generates the distinct ngrams (contiguous substrings) of the input that are exactly {@code length} characters
 * long.
 *
 * @param input the string to generate ngrams for.
 * @param length the exact length of every generated ngram.
 * @return the distinct ngrams in order of first occurrence, e.g. {@code uniqueNgrams("abcd", 3)} returns
 *         {@code ["abc", "bcd"]}. The result is empty when {@code length} is zero, negative, or greater than the
 *         length of the input.
 */
@ScalarFunction
public static String[] uniqueNgrams(String input, int length) {
  // A non-positive length has no valid ngram; a negative one would make substring() throw because the end index
  // would fall before the begin index.
  if (length <= 0 || length > input.length()) {
    return new String[0];
  }
  // Linked set: drops duplicates while keeping the order in which ngrams first appear.
  ObjectSet<String> ngramSet = new ObjectLinkedOpenHashSet<>();
  int lastStart = input.length() - length;
  for (int i = 0; i <= lastStart; i++) {
    ngramSet.add(input.substring(i, i + length));
  }
  return ngramSet.toArray(new String[0]);
}

/**
 * Generates the distinct ngrams (contiguous substrings) of the input whose length falls within
 * {@code [minGram, maxGram]}.
 *
 * @param input the string to generate ngrams for.
 * @param minGram the minimum length (inclusive) of the generated ngrams; values below 1 are treated as 1.
 * @param maxGram the maximum length (inclusive) of the generated ngrams; values above the length of the input are
 *                treated as the length of the input.
 * @return the distinct ngrams ordered by length first and by first occurrence within each length, e.g.
 *         {@code uniqueNgrams("abc", 0, 3)} returns {@code ["a", "b", "c", "ab", "bc", "abc"]}. The result is
 *         empty when the effective range is empty.
 */
@ScalarFunction
public static String[] uniqueNgrams(String input, int minGram, int maxGram) {
  // Linked set: drops duplicates while keeping the order in which ngrams are generated.
  ObjectSet<String> ngramSet = new ObjectLinkedOpenHashSet<>();
  int inputLength = input.length();
  // Start at 1: zero-length ngrams are skipped, and a negative length would make substring() throw because the
  // end index would fall before the begin index.
  int smallest = Math.max(minGram, 1);
  int largest = Math.min(maxGram, inputLength);
  for (int n = smallest; n <= largest; n++) {
    for (int i = 0; i <= inputLength - n; i++) {
      ngramSet.add(input.substring(i, i + n));
    }
  }
  return ngramSet.toArray(new String[0]);
}

/**
* TODO: Revisit if index should be one-based (both Presto and Postgres use one-based index, which starts with 1)
* @param input
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,41 @@ public static Object[][] isJsonTestCases() {
};
}

@DataProvider(name = "prefixAndSuffixTestCases")
public static Object[][] prefixAndSuffixTestCases() {
  // Row layout: input, max length, expected prefixes, expected suffixes,
  // expected prefixes prepended with "^", expected suffixes appended with "$".
  Object[][] cases = new Object[9][];

  // Multi-character input: max length below, at zero, and above the input length.
  cases[0] = new Object[]{
      "abcde", 3,
      new String[]{"a", "ab", "abc"},
      new String[]{"e", "de", "cde"},
      new String[]{"^a", "^ab", "^abc"},
      new String[]{"e$", "de$", "cde$"}
  };
  cases[1] = new Object[]{"abcde", 0, new String[]{}, new String[]{}, new String[]{}, new String[]{}};
  cases[2] = new Object[]{
      "abcde", 9,
      new String[]{"a", "ab", "abc", "abcd", "abcde"},
      new String[]{"e", "de", "cde", "bcde", "abcde"},
      new String[]{"^a", "^ab", "^abc", "^abcd", "^abcde"},
      new String[]{"e$", "de$", "cde$", "bcde$", "abcde$"}
  };

  // Single-character input.
  cases[3] = new Object[]{"a", 3, new String[]{"a"}, new String[]{"a"}, new String[]{"^a"}, new String[]{"a$"}};
  cases[4] = new Object[]{"a", 0, new String[]{}, new String[]{}, new String[]{}, new String[]{}};
  cases[5] = new Object[]{"a", 9, new String[]{"a"}, new String[]{"a"}, new String[]{"^a"}, new String[]{"a$"}};

  // Empty input never produces any prefix or suffix.
  cases[6] = new Object[]{"", 3, new String[]{}, new String[]{}, new String[]{}, new String[]{}};
  cases[7] = new Object[]{"", 0, new String[]{}, new String[]{}, new String[]{}, new String[]{}};
  cases[8] = new Object[]{"", 9, new String[]{}, new String[]{}, new String[]{}, new String[]{}};

  return cases;
}

@DataProvider(name = "ngramTestCases")
public static Object[][] ngramTestCases() {
  // Row layout: input, min gram, max gram, expected ngrams of exactly max-gram length,
  // expected ngrams with length in [min gram, max gram].
  Object[][] cases = new Object[12][];

  // Four-character input.
  cases[0] = new Object[]{
      "abcd", 0, 3,
      new String[]{"abc", "bcd"},
      new String[]{"a", "b", "c", "d", "ab", "bc", "cd", "abc", "bcd"}
  };
  cases[1] = new Object[]{"abcd", 2, 2, new String[]{"ab", "bc", "cd"}, new String[]{"ab", "bc", "cd"}};
  cases[2] = new Object[]{"abcd", 3, 0, new String[]{}, new String[]{}};

  // Three-character input.
  cases[3] = new Object[]{"abc", 0, 3, new String[]{"abc"}, new String[]{"a", "b", "c", "ab", "bc", "abc"}};
  cases[4] = new Object[]{"abc", 3, 0, new String[]{}, new String[]{}};
  cases[5] = new Object[]{"abc", 3, 3, new String[]{"abc"}, new String[]{"abc"}};

  // Single-character input.
  cases[6] = new Object[]{"a", 0, 3, new String[]{}, new String[]{"a"}};
  cases[7] = new Object[]{"a", 2, 3, new String[]{}, new String[]{}};
  cases[8] = new Object[]{"a", 3, 3, new String[]{}, new String[]{}};

  // Empty input.
  cases[9] = new Object[]{"", 3, 0, new String[]{}, new String[]{}};
  cases[10] = new Object[]{"", 3, 3, new String[]{}, new String[]{}};
  cases[11] = new Object[]{"", 0, 3, new String[]{}, new String[]{}};

  return cases;
}

@Test(dataProvider = "isJson")
public void testIsJson(String input, boolean expectedValue) {
assertEquals(StringFunctions.isJson(input), expectedValue);
Expand All @@ -88,4 +123,19 @@ public void testSplitPart(String input, String delimiter, int index, int limit,
assertEquals(StringFunctions.splitPart(input, delimiter, index), expectedToken);
assertEquals(StringFunctions.splitPart(input, delimiter, limit, index), expectedTokenWithLimitCounts);
}

/**
 * Checks the plain prefix/suffix generators, and their decorated variants using "^" and "$", against the
 * expected arrays supplied by the data provider.
 */
@Test(dataProvider = "prefixAndSuffixTestCases")
public void testPrefixAndSuffix(String input, int length, String[] expectedPrefix, String[] expectedSuffix,
    String[] expectedPrefixWithRegexChar, String[] expectedSuffixWithRegexChar) {
  // Each result is computed right before its assertion so that failures surface in the same order as before.
  String[] actualPrefixes = StringFunctions.prefixes(input, length);
  assertEquals(actualPrefixes, expectedPrefix);

  String[] actualSuffixes = StringFunctions.suffixes(input, length);
  assertEquals(actualSuffixes, expectedSuffix);

  String[] actualAnchoredPrefixes = StringFunctions.prefixesWithPrefix(input, length, "^");
  assertEquals(actualAnchoredPrefixes, expectedPrefixWithRegexChar);

  String[] actualAnchoredSuffixes = StringFunctions.suffixesWithSuffix(input, length, "$");
  assertEquals(actualAnchoredSuffixes, expectedSuffixWithRegexChar);
}

/**
 * Checks both ngram generators against the expected arrays supplied by the data provider: the exact-length
 * variant is called with {@code maxGram}, the range variant with {@code [minGram, maxGram]}.
 */
@Test(dataProvider = "ngramTestCases")
public void testNGram(String input, int minGram, int maxGram, String[] expectedExactNGram, String[] expectedNGram) {
  // Each result is computed right before its assertion so that failures surface in the same order as before.
  String[] exactLengthNgrams = StringFunctions.uniqueNgrams(input, maxGram);
  assertEquals(exactLengthNgrams, expectedExactNGram);

  String[] rangedNgrams = StringFunctions.uniqueNgrams(input, minGram, maxGram);
  assertEquals(rangedNgrams, expectedNGram);
}
}

0 comments on commit 36c4b9a

Please sign in to comment.