From 36c4b9a86fcab77e96cb1e90b1900efca0e1ce7c Mon Sep 17 00:00:00 2001
From: deemoliu
Date: Tue, 23 Apr 2024 15:20:35 -0700
Subject: [PATCH] Add Prefix, Suffix and Ngram UDFs (#12392)

---
 .../function/scalar/StringFunctions.java      | 129 ++++++++++++++++++
 .../function/scalar/StringFunctionsTest.java  |  53 +++++++
 2 files changed, 182 insertions(+)

diff --git a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
index 374917ec993..31baeb5d2d4 100644
--- a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
+++ b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
@@ -18,6 +18,8 @@
  */
 package org.apache.pinot.common.function.scalar;
 
+import it.unimi.dsi.fastutil.objects.ObjectLinkedOpenHashSet;
+import it.unimi.dsi.fastutil.objects.ObjectSet;
 import java.io.UnsupportedEncodingException;
 import java.net.URLDecoder;
 import java.net.URLEncoder;
@@ -28,6 +30,7 @@
 import java.util.UUID;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+import javax.annotation.Nullable;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.pinot.common.utils.RegexpPatternConverterUtils;
 import org.apache.pinot.spi.annotations.ScalarFunction;
@@ -580,6 +583,132 @@ public static String[] split(String input, String delimiter, int limit) {
     return StringUtils.splitByWholeSeparator(input, delimiter, limit);
   }
 
+  /**
+   * Generates all prefixes of the input string that are at most {@code maxlength} characters long.
+   *
+   * @param input an input string for prefix strings generation.
+   * @param maxlength the max length (inclusive) of the generated prefixes; values below 1 yield an empty array.
+   * @return an array of the prefixes of the input string, ordered from the shortest to the longest.
+   */
+  @ScalarFunction
+  public static String[] prefixes(String input, int maxlength) {
+    // Clamp to 0 so a negative maxlength yields an empty array instead of a NegativeArraySizeException.
+    int arrLength = Math.max(0, Math.min(maxlength, input.length()));
+    String[] prefixArr = new String[arrLength];
+    for (int prefixIdx = 1; prefixIdx <= arrLength; prefixIdx++) {
+      prefixArr[prefixIdx - 1] = input.substring(0, prefixIdx);
+    }
+    return prefixArr;
+  }
+
+  /**
+   * Generates all prefixes of the input string that are at most {@code maxlength} characters long, each prepended
+   * with the given {@code prefix}.
+   *
+   * @param input an input string for prefix strings generation.
+   * @param maxlength the max length (inclusive) of the generated prefixes; values below 1 yield an empty array.
+   * @param prefix the prefix to be prepended to prefix strings generated. e.g. '^' for regex matching. When null,
+   *               the plain prefixes are returned.
+   * @return an array of prefix matchers of the input string, ordered from the shortest to the longest.
+   */
+  @ScalarFunction(nullableParameters = true, names = {"prefixesWithPrefix", "prefixes_with_prefix"})
+  public static String[] prefixesWithPrefix(String input, int maxlength, @Nullable String prefix) {
+    if (prefix == null) {
+      return prefixes(input, maxlength);
+    }
+    // Clamp to 0 so a negative maxlength yields an empty array instead of a NegativeArraySizeException.
+    int arrLength = Math.max(0, Math.min(maxlength, input.length()));
+    String[] prefixArr = new String[arrLength];
+    for (int prefixIdx = 1; prefixIdx <= arrLength; prefixIdx++) {
+      prefixArr[prefixIdx - 1] = prefix + input.substring(0, prefixIdx);
+    }
+    return prefixArr;
+  }
+
+  /**
+   * Generates all suffixes of the input string that are at most {@code maxlength} characters long.
+   *
+   * @param input an input string for suffix strings generation.
+   * @param maxlength the max length (inclusive) of the generated suffixes; values below 1 yield an empty array.
+   * @return an array of the suffixes of the input string, ordered from the shortest to the longest.
+   */
+  @ScalarFunction
+  public static String[] suffixes(String input, int maxlength) {
+    // Clamp to 0 so a negative maxlength yields an empty array instead of a NegativeArraySizeException.
+    int arrLength = Math.max(0, Math.min(maxlength, input.length()));
+    String[] suffixArr = new String[arrLength];
+    for (int suffixIdx = 1; suffixIdx <= arrLength; suffixIdx++) {
+      suffixArr[suffixIdx - 1] = input.substring(input.length() - suffixIdx);
+    }
+    return suffixArr;
+  }
+
+  /**
+   * Generates all suffixes of the input string that are at most {@code maxlength} characters long, each appended
+   * with the given {@code suffix}.
+   *
+   * @param input an input string for suffix strings generation.
+   * @param maxlength the max length (inclusive) of the generated suffixes; values below 1 yield an empty array.
+   * @param suffix the suffix string to be appended for suffix strings generated. e.g. '$' for regex matching. When
+   *               null, the plain suffixes are returned.
+   * @return an array of suffix matchers of the input string, ordered from the shortest to the longest.
+   */
+  @ScalarFunction(nullableParameters = true, names = {"suffixesWithSuffix", "suffixes_with_suffix"})
+  public static String[] suffixesWithSuffix(String input, int maxlength, @Nullable String suffix) {
+    if (suffix == null) {
+      return suffixes(input, maxlength);
+    }
+    // Clamp to 0 so a negative maxlength yields an empty array instead of a NegativeArraySizeException.
+    int arrLength = Math.max(0, Math.min(maxlength, input.length()));
+    String[] suffixArr = new String[arrLength];
+    for (int suffixIdx = 1; suffixIdx <= arrLength; suffixIdx++) {
+      suffixArr[suffixIdx - 1] = input.substring(input.length() - suffixIdx) + suffix;
+    }
+    return suffixArr;
+  }
+
+  /**
+   * Generates the unique ngrams of the input string that are exactly {@code length} characters long.
+   *
+   * @param input an input string for ngram generations.
+   * @param length the exact length of the ngrams; values below 1 or above the input length yield an empty array.
+   * @return an array of the unique ngrams of the input string, in order of first occurrence.
+   */
+  @ScalarFunction
+  public static String[] uniqueNgrams(String input, int length) {
+    // A non-positive length has no valid ngrams; a negative one would make substring() throw below.
+    if (length <= 0 || length > input.length()) {
+      return new String[0];
+    }
+    ObjectSet<String> ngramSet = new ObjectLinkedOpenHashSet<>();
+    for (int i = 0; i < input.length() - length + 1; i++) {
+      ngramSet.add(input.substring(i, i + length));
+    }
+    return ngramSet.toArray(new String[0]);
+  }
+
+  /**
+   * Generates the unique ngrams of the input string whose lengths are within [minGram, maxGram].
+   *
+   * @param input an input string for ngram generations.
+   * @param minGram the min length (inclusive) of the ngrams; values below 1 are treated as 1.
+   * @param maxGram the max length (inclusive) of the ngrams; values above the input length are capped to it.
+   * @return an array of the unique ngrams of the input string, ordered by length and then by first occurrence.
+   */
+  @ScalarFunction
+  public static String[] uniqueNgrams(String input, int minGram, int maxGram) {
+    ObjectSet<String> ngramSet = new ObjectLinkedOpenHashSet<>();
+    // Start from 1: ngrams of length 0 are meaningless and negative lengths would make substring() throw.
+    int minLength = Math.max(minGram, 1);
+    int maxLength = Math.min(maxGram, input.length());
+    for (int n = minLength; n <= maxLength; n++) {
+      for (int i = 0; i < input.length() - n + 1; i++) {
+        ngramSet.add(input.substring(i, i + n));
+      }
+    }
+    return ngramSet.toArray(new String[0]);
+  }
+
   /**
    * TODO: Revisit if index should be one-based (both Presto and Postgres use one-based index, which starts with 1)
    * @param input
diff --git a/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java b/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java
index d75b8ada435..6c9fa465f54 100644
--- a/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java
+++ b/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java
@@ -77,6 +77,44 @@ public static Object[][] isJsonTestCases() {
     };
   }
 
+  @DataProvider(name = "prefixAndSuffixTestCases")
+  public static Object[][] prefixAndSuffixTestCases() {
+    return new Object[][]{
+        {"abcde", 3, new String[]{"a", "ab", "abc"}, new String[]{"e", "de", "cde"}, new String[]{
+            "^a", "^ab", "^abc"}, new String[]{"e$", "de$", "cde$"}},
+        {"abcde", 0, new String[]{}, new String[]{}, new String[]{}, new String[]{}},
+        {"abcde", -1, new String[]{}, new String[]{}, new String[]{}, new String[]{}},
+        {"abcde", 9, new String[]{"a", "ab", "abc", "abcd", "abcde"}, new String[]{"e", "de", "cde", "bcde", "abcde"},
+            new String[]{"^a", "^ab", "^abc", "^abcd", "^abcde"}, new String[]{"e$", "de$", "cde$", "bcde$", "abcde$"}},
+        {"a", 3, new String[]{"a"}, new String[]{"a"}, new String[]{"^a"}, new String[]{"a$"}},
+        {"a", 0, new String[]{}, new String[]{}, new String[]{}, new String[]{}},
+        {"a", 9, new String[]{"a"}, new String[]{"a"}, new String[]{"^a"}, new String[]{"a$"}},
+        {"", 3, new String[]{}, new String[]{}, new String[]{}, new String[]{}},
+        {"", 0, new String[]{}, new String[]{}, new String[]{}, new String[]{}},
+        {"", 9, new String[]{}, new String[]{}, new String[]{}, new String[]{}}
+    };
+  }
+
+  @DataProvider(name = "ngramTestCases")
+  public static Object[][] ngramTestCases() {
+    return new Object[][]{
+        {"abcd", 0, 3, new String[]{"abc", "bcd"}, new String[]{"a", "b", "c", "d", "ab", "bc", "cd", "abc", "bcd"}},
+        {"abcd", 2, 2, new String[]{"ab", "bc", "cd"}, new String[]{"ab", "bc", "cd"}},
+        {"abcd", 3, 0, new String[]{}, new String[]{}},
+        {"abcd", -1, 2, new String[]{"ab", "bc", "cd"}, new String[]{"a", "b", "c", "d", "ab", "bc", "cd"}},
+        {"abcd", 0, -1, new String[]{}, new String[]{}},
+        {"abc", 0, 3, new String[]{"abc"}, new String[]{"a", "b", "c", "ab", "bc", "abc"}},
+        {"abc", 3, 0, new String[]{}, new String[]{}},
+        {"abc", 3, 3, new String[]{"abc"}, new String[]{"abc"}},
+        {"a", 0, 3, new String[]{}, new String[]{"a"}},
+        {"a", 2, 3, new String[]{}, new String[]{}},
+        {"a", 3, 3, new String[]{}, new String[]{}},
+        {"", 3, 0, new String[]{}, new String[]{}},
+        {"", 3, 3, new String[]{}, new String[]{}},
+        {"", 0, 3, new String[]{}, new String[]{}}
+    };
+  }
+
   @Test(dataProvider = "isJson")
   public void testIsJson(String input, boolean expectedValue) {
     assertEquals(StringFunctions.isJson(input), expectedValue);
@@ -88,4 +126,19 @@ public void testSplitPart(String input, String delimiter, int index, int limit,
     assertEquals(StringFunctions.splitPart(input, delimiter, index), expectedToken);
     assertEquals(StringFunctions.splitPart(input, delimiter, limit, index), expectedTokenWithLimitCounts);
   }
+
+  @Test(dataProvider = "prefixAndSuffixTestCases")
+  public void testPrefixAndSuffix(String input, int length, String[] expectedPrefix, String[] expectedSuffix,
+      String[] expectedPrefixWithRegexChar, String[] expectedSuffixWithRegexChar) {
+    assertEquals(StringFunctions.prefixes(input, length), expectedPrefix);
+    assertEquals(StringFunctions.suffixes(input, length), expectedSuffix);
+    assertEquals(StringFunctions.prefixesWithPrefix(input, length, "^"), expectedPrefixWithRegexChar);
+    assertEquals(StringFunctions.suffixesWithSuffix(input, length, "$"), expectedSuffixWithRegexChar);
+  }
+
+  @Test(dataProvider = "ngramTestCases")
+  public void testNGram(String input, int minGram, int maxGram, String[] expectedExactNGram, String[] expectedNGram) {
+    assertEquals(StringFunctions.uniqueNgrams(input, maxGram), expectedExactNGram);
+    assertEquals(StringFunctions.uniqueNgrams(input, minGram, maxGram), expectedNGram);
+  }
 }