Skip to content

Commit

Permalink
[improvement][chat] Add threshold judgment to field replacement (#1850)
Browse files Browse the repository at this point in the history
  • Loading branch information
lexluo09 authored Oct 28, 2024
1 parent c07b64d commit 920c6e2
Show file tree
Hide file tree
Showing 9 changed files with 132 additions and 78 deletions.
6 changes: 6 additions & 0 deletions common/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,12 @@
<groupId>org.codehaus.woodstox</groupId>
<artifactId>stax2-api</artifactId>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-inline</artifactId>
<version>${mockito-inline.version}</version>
<scope>test</scope>
</dependency>
</dependencies>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package com.tencent.supersonic.common.jsqlparser;

/**
 * String-similarity helpers built on an edit distance that, in addition to single-character
 * insertion, deletion and substitution, counts the swap of two adjacent characters as one edit.
 */
public class EditDistanceUtils {

    /**
     * Returns a normalized similarity score for two strings.
     *
     * <p>The score is {@code 1 - editDistance / max(length)}: {@code 1.0} for identical strings,
     * decreasing as more edits are needed relative to the longer string.
     *
     * @param word1 first string, must not be {@code null}
     * @param word2 second string, must not be {@code null}
     * @return the similarity score; {@code 1.0} when both strings are empty
     * @throws NullPointerException if either argument is {@code null}
     */
    public static double getSimilarity(String word1, String word2) {
        final int longest = Math.max(word1.length(), word2.length());
        if (longest == 0) {
            // Two empty strings are identical. Without this guard the ratio is 0.0 / 0 == NaN,
            // which fails every threshold comparison and breaks comparator ordering.
            return 1.0;
        }
        return 1 - (double) editDistance(word1, word2) / longest;
    }

    /**
     * Computes the edit distance between two strings using a full dynamic-programming table.
     *
     * <p>{@code dp[i][j]} holds the distance between the first {@code i} characters of
     * {@code word1} and the first {@code j} characters of {@code word2}.
     *
     * @param word1 first string, must not be {@code null}
     * @param word2 second string, must not be {@code null}
     * @return the number of edits needed to turn {@code word1} into {@code word2}
     * @throws NullPointerException if either argument is {@code null}
     */
    public static int editDistance(String word1, String word2) {
        final int m = word1.length();
        final int n = word2.length();
        int[][] dp = new int[m + 1][n + 1];

        // Base cases: turning an empty prefix into a prefix of length k takes k insertions
        // (first row) or k deletions (first column).
        for (int j = 0; j <= n; ++j) {
            dp[0][j] = j;
        }
        for (int i = 0; i <= m; ++i) {
            dp[i][0] = i;
        }

        for (int i = 1; i <= m; ++i) {
            char ci = word1.charAt(i - 1);
            for (int j = 1; j <= n; ++j) {
                char cj = word2.charAt(j - 1);
                if (ci == cj) {
                    // Matching characters cost nothing.
                    dp[i][j] = dp[i - 1][j - 1];
                } else if (i > 1 && j > 1 && ci == word2.charAt(j - 2)
                        && cj == word1.charAt(i - 2)) {
                    // The last two characters are each other's mirror ("ab" vs "ba"):
                    // allow a single adjacent swap alongside insertion and deletion.
                    dp[i][j] = 1 + Math.min(dp[i - 2][j - 2],
                            Math.min(dp[i][j - 1], dp[i - 1][j]));
                } else {
                    // Cheapest of substitution, insertion and deletion.
                    dp[i][j] = Math.min(dp[i - 1][j - 1] + 1,
                            Math.min(dp[i][j - 1] + 1, dp[i - 1][j] + 1));
                }
            }
        }
        return dp[m][n];
    }
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.tencent.supersonic.common.jsqlparser;

import com.tencent.supersonic.common.util.ContextUtils;
import lombok.extern.slf4j.Slf4j;
import net.sf.jsqlparser.expression.ExpressionVisitorAdapter;
import net.sf.jsqlparser.expression.Function;
Expand All @@ -9,7 +10,6 @@

@Slf4j
public class FieldReplaceVisitor extends ExpressionVisitorAdapter {
ParseVisitorHelper parseVisitorHelper = new ParseVisitorHelper();
private Map<String, String> fieldNameMap;
private ThreadLocal<Boolean> exactReplace = ThreadLocal.withInitial(() -> false);

Expand All @@ -20,7 +20,8 @@ public FieldReplaceVisitor(Map<String, String> fieldNameMap, boolean exactReplac

@Override
public void visit(Column column) {
parseVisitorHelper.replaceColumn(column, fieldNameMap, exactReplace.get());
ReplaceService replaceService = ContextUtils.getBean(ReplaceService.class);
replaceService.replaceColumn(column, fieldNameMap, exactReplace.get());
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.tencent.supersonic.common.jsqlparser;

import com.tencent.supersonic.common.util.ContextUtils;
import lombok.extern.slf4j.Slf4j;
import net.sf.jsqlparser.expression.DoubleValue;
import net.sf.jsqlparser.expression.Expression;
Expand Down Expand Up @@ -27,7 +28,6 @@
@Slf4j
public class FieldValueReplaceVisitor extends ExpressionVisitorAdapter {

ParseVisitorHelper parseVisitorHelper = new ParseVisitorHelper();
private boolean exactReplace;
private Map<String, Map<String, String>> filedNameToValueMap;

Expand Down Expand Up @@ -138,7 +138,8 @@ public <T extends Expression> void replaceComparisonExpression(T expression) {
private String getReplaceValue(Map<String, String> valueMap, String beforeValue) {
String afterValue = valueMap.get(String.valueOf(beforeValue));
if (StringUtils.isEmpty(afterValue) && !exactReplace) {
return parseVisitorHelper.getReplaceValue(beforeValue, valueMap, false);
ReplaceService replaceService = ContextUtils.getBean(ReplaceService.class);
return replaceService.getReplaceValue(beforeValue, valueMap, false);
}
return afterValue;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.tencent.supersonic.common.jsqlparser;

import com.tencent.supersonic.common.util.ContextUtils;
import lombok.extern.slf4j.Slf4j;
import net.sf.jsqlparser.expression.Expression;
import net.sf.jsqlparser.expression.Function;
Expand All @@ -14,7 +15,6 @@
@Slf4j
public class GroupByReplaceVisitor implements GroupByVisitor {

ParseVisitorHelper parseVisitorHelper = new ParseVisitorHelper();
private Map<String, String> fieldNameMap;
private boolean exactReplace;

Expand All @@ -34,10 +34,11 @@ public void visit(GroupByElement groupByElement) {
}

private void replaceExpression(Expression expression) {
ReplaceService replaceService = ContextUtils.getBean(ReplaceService.class);
if (expression instanceof Column) {
parseVisitorHelper.replaceColumn((Column) expression, fieldNameMap, exactReplace);
replaceService.replaceColumn((Column) expression, fieldNameMap, exactReplace);
} else if (expression instanceof Function) {
parseVisitorHelper.replaceFunction((Function) expression, fieldNameMap, exactReplace);
replaceService.replaceFunction((Function) expression, fieldNameMap, exactReplace);
}
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.tencent.supersonic.common.jsqlparser;

import com.tencent.supersonic.common.util.ContextUtils;
import net.sf.jsqlparser.expression.Expression;
import net.sf.jsqlparser.expression.Function;
import net.sf.jsqlparser.schema.Column;
Expand All @@ -9,8 +10,6 @@
import java.util.Map;

public class OrderByReplaceVisitor extends OrderByVisitorAdapter {

ParseVisitorHelper parseVisitorHelper = new ParseVisitorHelper();
private Map<String, String> fieldNameMap;
private boolean exactReplace;

Expand All @@ -22,11 +21,12 @@ public OrderByReplaceVisitor(Map<String, String> fieldNameMap, boolean exactRepl
@Override
public void visit(OrderByElement orderBy) {
Expression expression = orderBy.getExpression();
ReplaceService replaceService = ContextUtils.getBean(ReplaceService.class);
if (expression instanceof Column) {
parseVisitorHelper.replaceColumn((Column) expression, fieldNameMap, exactReplace);
replaceService.replaceColumn((Column) expression, fieldNameMap, exactReplace);
}
if (expression instanceof Function) {
parseVisitorHelper.replaceFunction((Function) expression, fieldNameMap, exactReplace);
replaceService.replaceFunction((Function) expression, fieldNameMap, exactReplace);
}
super.visit(orderBy);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,20 +1,28 @@
package com.tencent.supersonic.common.jsqlparser;

import com.tencent.supersonic.common.util.StringUtil;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import net.sf.jsqlparser.expression.Expression;
import net.sf.jsqlparser.expression.Function;
import net.sf.jsqlparser.expression.operators.relational.ExpressionList;
import net.sf.jsqlparser.schema.Column;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;

import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.stream.Collectors;

@Slf4j
public class ParseVisitorHelper {
@Service
@Data
public class ReplaceService {

@Value("${s2.replace.column.threshold:0.4}")
private double replaceColumnThreshold;

public void replaceFunction(Function expression, Map<String, String> fieldNameMap,
boolean exactReplace) {
Expand All @@ -38,57 +46,28 @@ public void replaceColumn(Column column, Map<String, String> fieldNameMap,

public String getReplaceValue(String beforeValue, Map<String, String> valueMap,
boolean exactReplace) {
String value = valueMap.get(beforeValue);
if (StringUtils.isNotBlank(value)) {
return value;
String replaceValue = valueMap.get(beforeValue);
if (StringUtils.isNotBlank(replaceValue)) {
return replaceValue;
}
if (exactReplace) {
return null;
}
Optional<Entry<String, String>> first = valueMap.entrySet().stream().sorted((k1, k2) -> {
String k1Value = k1.getKey();
String k2Value = k2.getKey();
Double k1Similarity = getSimilarity(beforeValue, k1Value);
Double k2Similarity = getSimilarity(beforeValue, k2Value);
Double k1Similarity = EditDistanceUtils.getSimilarity(beforeValue, k1Value);
Double k2Similarity = EditDistanceUtils.getSimilarity(beforeValue, k2Value);
return k2Similarity.compareTo(k1Similarity);
}).collect(Collectors.toList()).stream().findFirst();

if (first.isPresent()) {
return first.get().getValue();
}
return beforeValue;
}

public static int editDistance(String word1, String word2) {
final int m = word1.length();
final int n = word2.length();
int[][] dp = new int[m + 1][n + 1];
for (int j = 0; j <= n; ++j) {
dp[0][j] = j;
}
for (int i = 0; i <= m; ++i) {
dp[i][0] = i;
}

for (int i = 1; i <= m; ++i) {
char ci = word1.charAt(i - 1);
for (int j = 1; j <= n; ++j) {
char cj = word2.charAt(j - 1);
if (ci == cj) {
dp[i][j] = dp[i - 1][j - 1];
} else if (i > 1 && j > 1 && ci == word2.charAt(j - 2)
&& cj == word1.charAt(i - 2)) {
dp[i][j] = 1 + Math.min(dp[i - 2][j - 2], Math.min(dp[i][j - 1], dp[i - 1][j]));
} else {
dp[i][j] = Math.min(dp[i - 1][j - 1] + 1,
Math.min(dp[i][j - 1] + 1, dp[i - 1][j] + 1));
}
replaceValue = first.get().getValue();
double similarity = EditDistanceUtils.getSimilarity(beforeValue, replaceValue);
if (similarity > replaceColumnThreshold) {
return replaceValue;
}
}
return dp[m][n];
}

public double getSimilarity(String word1, String word2) {
return 1 - (double) editDistance(word1, word2) / Math.max(word2.length(), word1.length());
return beforeValue;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ void testReplaceFields1() {
replaceSql = SqlReplaceHelper.replaceFields(replaceSql, fieldToBizName);
replaceSql = SqlReplaceHelper.replaceFunction(replaceSql);
Assert.assertEquals(
"SELECT song_name FROM 歌曲库 WHERE (publish_date >= '2023-08-08' AND publish_date <= '2023-08-09')"
+ " AND singer_name = '邓紫棋' AND sys_imp_date = '2023-08-09' AND song_publis_date = '2023-08-01'"
+ " ORDER BY play_count DESC LIMIT 11",
"SELECT song_name FROM 歌曲库 WHERE (publish_date >= '2023-08-08' AND publish_date "
+ "<= '2023-08-09') AND singer_name = '邓紫棋' AND sys_imp_date = '2023-08-09' AND "
+ "歌曲发布时 = '2023-08-01' ORDER BY 播放量 DESC LIMIT 11",
replaceSql);
}

Expand Down Expand Up @@ -77,9 +77,9 @@ void testReplaceFields5() {
replaceSql = SqlReplaceHelper.replaceFields(replaceSql, fieldToBizName);
replaceSql = SqlReplaceHelper.replaceFunction(replaceSql);

Assert.assertEquals("SELECT YEAR(发行日期), count(song_name) FROM 歌曲库 "
+ "WHERE YEAR(发行日期) IN (2022, 2023) AND sys_imp_date = '2023-08-14' "
+ "GROUP BY YEAR(publish_date)", replaceSql);
Assert.assertEquals("SELECT YEAR(发行日期), count(song_name) FROM 歌曲库 WHERE "
+ "YEAR(发行日期) IN (2022, 2023) AND sys_imp_date = '2023-08-14' GROUP BY YEAR(发行日期)",
replaceSql);
}

@Test
Expand All @@ -91,9 +91,10 @@ void testReplaceFields6() {
replaceSql = SqlReplaceHelper.replaceFields(replaceSql, fieldToBizName);
replaceSql = SqlReplaceHelper.replaceFunction(replaceSql);

Assert.assertEquals("SELECT YEAR(发行日期), count(song_name) FROM 歌曲库 "
+ "WHERE YEAR(发行日期) IN (2022, 2023) AND sys_imp_date = '2023-08-14'"
+ " GROUP BY publish_date", replaceSql);
Assert.assertEquals(
"SELECT YEAR(发行日期), count(song_name) FROM 歌曲库 WHERE YEAR(发行日期) "
+ "IN (2022, 2023) AND sys_imp_date = '2023-08-14' GROUP BY 发行日期",
replaceSql);

}

Expand All @@ -107,9 +108,8 @@ void testReplaceFields7() {
replaceSql = SqlReplaceHelper.replaceFunction(replaceSql);

Assert.assertEquals(
"SELECT song_name FROM 歌曲库 WHERE (publish_date >= '2022-08-11' "
+ "AND publish_date <= '2023-08-11') AND play_count > 1000000 AND "
+ "(sys_imp_date >= '2023-07-12' AND sys_imp_date <= '2023-08-11')",
"SELECT song_name FROM 歌曲库 WHERE (publish_date >= '2022-08-11' AND publish_date <= '2023-08-11')"
+ " AND 结算播放量 > 1000000 AND (sys_imp_date >= '2023-07-12' AND sys_imp_date <= '2023-08-11')",
replaceSql);
}

Expand All @@ -123,8 +123,9 @@ void testReplaceFields8() {
replaceSql = SqlReplaceHelper.replaceFunction(replaceSql);

Assert.assertEquals(
"SELECT song_name FROM 歌曲库 WHERE (publish_date >= '2023-08-08' AND publish_date <= '2023-08-09')"
+ " AND singer_name = '邓紫棋' AND sys_imp_date = '2023-08-09' ORDER BY play_count DESC LIMIT 11",
"SELECT song_name FROM 歌曲库 WHERE (publish_date >= '2023-08-08' AND publish_date "
+ "<= '2023-08-09') AND singer_name = '邓紫棋' AND sys_imp_date = '2023-08-09' ORDER BY "
+ "播放量 DESC LIMIT 11",
replaceSql);
}

Expand All @@ -138,8 +139,9 @@ void testReplaceFields9() {
replaceSql = SqlReplaceHelper.replaceFunction(replaceSql);

Assert.assertEquals(
"SELECT song_name FROM 歌曲库 WHERE (publish_date >= '2023-01-01' AND publish_date <= '2023-08-09')"
+ " AND singer_name = '邓紫棋' AND sys_imp_date = '2023-08-09' ORDER BY play_count DESC LIMIT 11",
"SELECT song_name FROM 歌曲库 WHERE (publish_date >= '2023-01-01' AND publish_date "
+ "<= '2023-08-09') AND singer_name = '邓紫棋' AND sys_imp_date = '2023-08-09' "
+ "ORDER BY 播放量 DESC LIMIT 11",
replaceSql);
}

Expand All @@ -153,8 +155,9 @@ void testReplaceFields10() {
replaceSql = SqlReplaceHelper.replaceFunction(replaceSql);

Assert.assertEquals(
"SELECT song_name FROM 歌曲库 WHERE (publish_date >= '2023-02-09' AND publish_date <= '2023-08-09')"
+ " AND singer_name = '邓紫棋' AND sys_imp_date = '2023-08-09' ORDER BY play_count DESC LIMIT 11",
"SELECT song_name FROM 歌曲库 WHERE (publish_date >= '2023-02-09' AND publish_date <="
+ " '2023-08-09') AND singer_name = '邓紫棋' AND sys_imp_date = '2023-08-09' "
+ "ORDER BY 播放量 DESC LIMIT 11",
replaceSql);
}

Expand All @@ -167,9 +170,9 @@ void testReplaceField11() {
fieldToBizName);
replaceSql = SqlReplaceHelper.replaceFunction(replaceSql);
replaceSql = SqlRemoveHelper.removeNumberFilter(replaceSql);
Assert.assertEquals("SELECT song_name FROM 歌曲库 WHERE publish_date <= '2023-02-09' AND"
+ " singer_name = '邓紫棋' AND sys_imp_date = '2023-08-09'"
+ " ORDER BY play_count DESC LIMIT 11", replaceSql);
Assert.assertEquals("SELECT song_name FROM 歌曲库 WHERE publish_date <= '2023-02-09' "
+ "AND singer_name = '邓紫棋' AND sys_imp_date = '2023-08-09' ORDER BY 播放量 DESC LIMIT 11",
replaceSql);
}

@Test
Expand Down Expand Up @@ -222,9 +225,8 @@ void testReplaceFields15() {
replaceSql = SqlReplaceHelper.replaceFunction(replaceSql);

Assert.assertEquals(
"SELECT song_name, sum(评分) FROM CSpider WHERE (1 < 2) AND "
+ "sys_imp_date = '2023-10-15' GROUP BY song_name HAVING "
+ "sum(评分) < (SELECT min(评分) FROM CSpider WHERE user_id = '英文')",
"SELECT 歌曲名称, sum(评分) FROM CSpider WHERE (1 < 2) AND sys_imp_date = '2023-10-15' "
+ "GROUP BY 歌曲名称 HAVING sum(评分) < (SELECT min(评分) FROM CSpider WHERE 语种 = '英文')",
replaceSql);
}

Expand All @@ -239,9 +241,9 @@ void testReplaceFields16() {
replaceSql = SqlReplaceHelper.replaceFunction(replaceSql);

Assert.assertEquals(
"SELECT sum(评分) / (SELECT sum(评分) FROM CSpider WHERE sys_imp_date = '2023-10-15') "
+ "FROM CSpider WHERE sys_imp_date = '2023-10-15' GROUP BY song_name HAVING "
+ "sum(评分) < (SELECT min(评分) FROM CSpider WHERE user_id = '英文')",
"SELECT sum(评分) / (SELECT sum(评分) FROM CSpider WHERE sys_imp_date = '2023-10-15') FROM "
+ "CSpider WHERE sys_imp_date = '2023-10-15' GROUP BY 歌曲名称 HAVING sum(评分) < (SELECT min(评分) "
+ "FROM CSpider WHERE 语种 = '英文')",
replaceSql);
}

Expand Down
Loading

0 comments on commit 920c6e2

Please sign in to comment.