Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Prefix, Suffix and Ngram UDFs #12392

Merged
merged 6 commits into from
Apr 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
*/
package org.apache.pinot.common.function.scalar;

import it.unimi.dsi.fastutil.objects.ObjectLinkedOpenHashSet;
import it.unimi.dsi.fastutil.objects.ObjectSet;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
Expand All @@ -28,6 +30,7 @@
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
import org.apache.commons.lang3.StringUtils;
import org.apache.pinot.common.utils.RegexpPatternConverterUtils;
import org.apache.pinot.spi.annotations.ScalarFunction;
Expand Down Expand Up @@ -580,6 +583,111 @@ public static String[] split(String input, String delimiter, int limit) {
return StringUtils.splitByWholeSeparator(input, delimiter, limit);
}

/**
* @param input an input string for prefix strings generations.
* @param maxlength the max length of the prefix strings for the string.
* @return an array of the prefixes of the input string, with lengths 1 up to min(maxlength, input length).
*/
@ScalarFunction
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You want to add alias unique_prefixes, same for other functions

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need unique though? The prefixes will always be unique because they all have different length

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sg. i think the reason of unique_prefixes is to reserve prefixes for other purpose or implementations. if no objection, let me use prefixes() then.

public static String[] prefixes(String input, int maxlength) {
  // Produces the prefixes of input with lengths 1, 2, ..., min(maxlength, input.length()), shortest first.
  // The count is clamped at zero so that a negative maxlength yields an empty array instead of a
  // NegativeArraySizeException from the array allocation.
  int prefixCount = Math.max(0, Math.min(maxlength, input.length()));
  String[] prefixArr = new String[prefixCount];
  for (int prefixLen = 1; prefixLen <= prefixCount; prefixLen++) {
    // Slot (prefixLen - 1) holds the prefix that is prefixLen characters long.
    prefixArr[prefixLen - 1] = input.substring(0, prefixLen);
  }
  return prefixArr;
}

/**
* Generates the prefixes of a string and prepends a fixed marker string to each of them.
*
* @param input an input string for prefix strings generations.
* @param maxlength the max number of characters taken from the input for a prefix; the prepended marker is not
*        counted against it.
* @param prefix the prefix to be prepended to prefix strings generated. e.g. '^' for regex matching. When null,
*        the result is whatever {@link #prefixes(String, int)} returns for the same input and maxlength.
* @return an array of min(maxlength, input length) strings, each being the marker followed by the first
*         1, 2, ... characters of the input.
*/
@ScalarFunction(nullableParameters = true, names = {"prefixesWithPrefix", "prefixes_with_prefix"})
public static String[] prefixesWithPrefix(String input, int maxlength, @Nullable String prefix) {
if (prefix == null) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In order to accept null, you want to annotate it as nullableParameters. Please also annotate the parameter to be @Nullable

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated, thanks

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not addressed ^^
Take a look at ScalarFunction.class. You need to annotate it as nullableParameters

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks @Jackie-Jiang for pointer. updated

// No marker supplied: delegate to the plain prefix generator.
return prefixes(input, maxlength);
}
// NOTE(review): a negative maxlength makes arrLength negative, and the array allocation below then throws
// NegativeArraySizeException -- confirm whether callers can pass a negative value.
int arrLength = Math.min(maxlength, input.length());
String[] prefixArr = new String[arrLength];
for (int prefixIdx = 1; prefixIdx <= arrLength; prefixIdx++) {
// Slot (prefixIdx - 1) holds the marker plus the first prefixIdx characters of the input.
prefixArr[prefixIdx - 1] = prefix + input.substring(0, prefixIdx);
}
return prefixArr;
}

/**
* @param input an input string for suffix strings generations.
* @param maxlength the max length of the suffix strings for the string.
* @return an array of the suffixes of the input string, with lengths 1 up to min(maxlength, input length).
*/
@ScalarFunction
public static String[] suffixes(String input, int maxlength) {
  // Produces the suffixes of input with lengths 1, 2, ..., min(maxlength, input.length()), shortest first.
  // The count is clamped at zero so that a negative maxlength yields an empty array instead of a
  // NegativeArraySizeException from the array allocation.
  int inputLen = input.length();
  int suffixCount = Math.max(0, Math.min(maxlength, inputLen));
  String[] suffixArr = new String[suffixCount];
  for (int suffixLen = 1; suffixLen <= suffixCount; suffixLen++) {
    // Slot (suffixLen - 1) holds the last suffixLen characters of the input.
    suffixArr[suffixLen - 1] = input.substring(inputLen - suffixLen);
  }
  return suffixArr;
}

/**
 * Generates the suffixes of a string and appends a fixed marker string to each of them.
 *
 * @param input an input string for suffix strings generations.
 * @param maxlength the max number of characters taken from the input for a suffix; the appended marker is not
 *        counted against it. A value of zero or less produces an empty array.
 * @param suffix the suffix string to be appended for suffix strings generated. e.g. '$' for regex matching.
 *        When null, nothing is appended.
 * @return an array of min(maxlength, input length) strings, each being the last 1, 2, ... characters of the
 *         input followed by the marker.
 */
@ScalarFunction(nullableParameters = true, names = {"suffixesWithSuffix", "suffixes_with_suffix"})
public static String[] suffixesWithSuffix(String input, int maxlength, @Nullable String suffix) {
  // A null marker appends nothing, so the output holds the bare suffixes of the input.
  String marker = suffix == null ? "" : suffix;
  int inputLen = input.length();
  // Clamp at zero so that a negative maxlength yields an empty array instead of a NegativeArraySizeException.
  int suffixCount = Math.max(0, Math.min(maxlength, inputLen));
  String[] suffixArr = new String[suffixCount];
  for (int suffixLen = 1; suffixLen <= suffixCount; suffixLen++) {
    // Slot (suffixLen - 1) holds the last suffixLen characters of the input plus the marker.
    suffixArr[suffixLen - 1] = input.substring(inputLen - suffixLen) + marker;
  }
  return suffixArr;
}

/**
 * Generates the distinct n-grams of a single, exact length.
 *
 * @param input an input string for ngram generations.
 * @param length the exact length of every ngram generated.
 * @return an array of the unique substrings of the input that are exactly {@code length} characters long, in
 *         order of first occurrence; empty when {@code length} is not positive or exceeds the input length.
 */
@ScalarFunction
public static String[] uniqueNgrams(String input, int length) {
  // A non-positive length has no n-grams. Rejecting negatives here also keeps substring() below from being
  // called with an end index smaller than its begin index, which would throw.
  if (length <= 0 || length > input.length()) {
    return new String[0];
  }
  // Linked hash set: drops repeated n-grams while keeping them in first-occurrence order.
  ObjectSet<String> ngramSet = new ObjectLinkedOpenHashSet<>();
  int lastStart = input.length() - length;
  for (int start = 0; start <= lastStart; start++) {
    ngramSet.add(input.substring(start, start + length));
  }
  return ngramSet.toArray(new String[0]);
}

/**
* Generates the distinct n-grams of every length in a range.
*
* @param input an input string for ngram generations.
* @param minGram the min length of the ngram for the string.
* @param maxGram the max length of the ngram for the string.
* @return an array of the unique ngrams of the string whose lengths are within the specified range
*         [minGram, maxGram]. Length 0 is skipped and lengths greater than the input length are not generated,
*         so the array is empty when no length in the range qualifies.
*/
@ScalarFunction
public static String[] uniqueNgrams(String input, int minGram, int maxGram) {
// A set is used so that an n-gram occurring more than once in the input is returned only once.
ObjectSet<String> ngramSet = new ObjectLinkedOpenHashSet<>();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hi @Jackie-Jiang ngrams doesn't guarantee to be unique, right? so the usage of Set is to dedup and avoid duplicates.

// Outer loop walks the n-gram lengths; it stops at maxGram or at the input length, whichever comes first.
for (int n = minGram; n <= maxGram && n <= input.length(); n++) {
// NOTE(review): only n == 0 is skipped. A negative minGram would reach substring(i, i + n) below with an end
// index smaller than the begin index -- confirm whether negative values can be passed.
if (n == 0) {
continue;
}
// Inner loop slides a window of n characters over the input.
for (int i = 0; i < input.length() - n + 1; i++) {
ngramSet.add(input.substring(i, i + n));
}
}
return ngramSet.toArray(new String[0]);
}

/**
* TODO: Revisit if index should be one-based (both Presto and Postgres use one-based index, which starts with 1)
* @param input
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,41 @@ public static Object[][] isJsonTestCases() {
};
}

@DataProvider(name = "prefixAndSuffixTestCases")
public static Object[][] prefixAndSuffixTestCases() {
  // Row layout: input, maxlength, expected prefixes, expected suffixes,
  //             expected prefixes with "^" prepended, expected suffixes with "$" appended.
  Object[][] cases = {
      // Multi-character input: maxlength below, at zero, and above the input length.
      {
          "abcde", 3,
          new String[]{"a", "ab", "abc"},
          new String[]{"e", "de", "cde"},
          new String[]{"^a", "^ab", "^abc"},
          new String[]{"e$", "de$", "cde$"}
      },
      {
          "abcde", 0,
          new String[]{}, new String[]{}, new String[]{}, new String[]{}
      },
      {
          "abcde", 9,
          new String[]{"a", "ab", "abc", "abcd", "abcde"},
          new String[]{"e", "de", "cde", "bcde", "abcde"},
          new String[]{"^a", "^ab", "^abc", "^abcd", "^abcde"},
          new String[]{"e$", "de$", "cde$", "bcde$", "abcde$"}
      },
      // Single-character input.
      {
          "a", 3,
          new String[]{"a"}, new String[]{"a"}, new String[]{"^a"}, new String[]{"a$"}
      },
      {
          "a", 0,
          new String[]{}, new String[]{}, new String[]{}, new String[]{}
      },
      {
          "a", 9,
          new String[]{"a"}, new String[]{"a"}, new String[]{"^a"}, new String[]{"a$"}
      },
      // Empty input never yields anything.
      {
          "", 3,
          new String[]{}, new String[]{}, new String[]{}, new String[]{}
      },
      {
          "", 0,
          new String[]{}, new String[]{}, new String[]{}, new String[]{}
      },
      {
          "", 9,
          new String[]{}, new String[]{}, new String[]{}, new String[]{}
      }
  };
  return cases;
}

@DataProvider(name = "ngramTestCases")
public static Object[][] ngramTestCases() {
  // Row layout: input, minGram, maxGram,
  //             expected n-grams of exactly maxGram characters, expected n-grams in [minGram, maxGram].
  Object[][] cases = {
      // Four-character input.
      {
          "abcd", 0, 3,
          new String[]{"abc", "bcd"},
          new String[]{"a", "b", "c", "d", "ab", "bc", "cd", "abc", "bcd"}
      },
      {
          "abcd", 2, 2,
          new String[]{"ab", "bc", "cd"},
          new String[]{"ab", "bc", "cd"}
      },
      {"abcd", 3, 0, new String[]{}, new String[]{}},
      // Three-character input: maxGram equal to the input length.
      {
          "abc", 0, 3,
          new String[]{"abc"},
          new String[]{"a", "b", "c", "ab", "bc", "abc"}
      },
      {"abc", 3, 0, new String[]{}, new String[]{}},
      {"abc", 3, 3, new String[]{"abc"}, new String[]{"abc"}},
      // Single-character input: maxGram longer than the input.
      {"a", 0, 3, new String[]{}, new String[]{"a"}},
      {"a", 2, 3, new String[]{}, new String[]{}},
      {"a", 3, 3, new String[]{}, new String[]{}},
      // Empty input never yields anything.
      {"", 3, 0, new String[]{}, new String[]{}},
      {"", 3, 3, new String[]{}, new String[]{}},
      {"", 0, 3, new String[]{}, new String[]{}}
  };
  return cases;
}

@Test(dataProvider = "isJson")
public void testIsJson(String input, boolean expectedValue) {
assertEquals(StringFunctions.isJson(input), expectedValue);
Expand All @@ -88,4 +123,19 @@ public void testSplitPart(String input, String delimiter, int index, int limit,
assertEquals(StringFunctions.splitPart(input, delimiter, index), expectedToken);
assertEquals(StringFunctions.splitPart(input, delimiter, limit, index), expectedTokenWithLimitCounts);
}

// Checks the prefix and suffix generators against every row of the "prefixAndSuffixTestCases" provider:
// the plain overloads, the overloads with a "^" / "$" marker, and the overloads called with a null marker.
@Test(dataProvider = "prefixAndSuffixTestCases")
public void testPrefixAndSuffix(String input, int length, String[] expectedPrefix, String[] expectedSuffix,
    String[] expectedPrefixWithRegexChar, String[] expectedSuffixWithRegexChar) {
  assertEquals(StringFunctions.prefixes(input, length), expectedPrefix);
  assertEquals(StringFunctions.suffixes(input, length), expectedSuffix);
  assertEquals(StringFunctions.prefixesWithPrefix(input, length, "^"), expectedPrefixWithRegexChar);
  assertEquals(StringFunctions.suffixesWithSuffix(input, length, "$"), expectedSuffixWithRegexChar);
  // The marker parameter is declared @Nullable: a null marker must give the same result as the plain overloads.
  assertEquals(StringFunctions.prefixesWithPrefix(input, length, null), expectedPrefix);
  assertEquals(StringFunctions.suffixesWithSuffix(input, length, null), expectedSuffix);
}

// Checks both uniqueNgrams overloads against every row of the "ngramTestCases" provider.
@Test(dataProvider = "ngramTestCases")
public void testNGram(String input, int minGram, int maxGram, String[] expectedExactNGram, String[] expectedNGram) {
  // Two-argument overload: maxGram is passed as the single n-gram length.
  String[] exactLengthNGrams = StringFunctions.uniqueNgrams(input, maxGram);
  assertEquals(exactLengthNGrams, expectedExactNGram);

  // Three-argument overload: every n-gram length from minGram to maxGram.
  String[] rangedNGrams = StringFunctions.uniqueNgrams(input, minGram, maxGram);
  assertEquals(rangedNGrams, expectedNGram);
}
}
Loading