优化版编辑距离算法
/**
 * Computes the edit (Levenshtein) distance between two strings: the minimum number of
 * single-character insertions, deletions and substitutions, each costing 1, needed to
 * turn {@code str} into {@code str1}.
 *
 * <p>Only one row of the dynamic-programming table is kept, so the working array has
 * {@code str1.length() + 1} entries.
 *
 * @param str  the first string; must not be {@code null}
 * @param str1 the second string; must not be {@code null}
 * @return the edit distance between {@code str} and {@code str1}
 * @throws NullPointerException if either argument is {@code null}
 */
public static int minDistance(String str, String str1) {
    final char[] source = str.toCharArray();
    final char[] target = str1.toCharArray();
    final int rows = source.length;
    final int cols = target.length;

    // row[k] is the distance between the source prefix handled so far and target[0..k).
    // Before any source character is consumed that is simply k insertions.
    final int[] row = new int[cols + 1];
    for (int k = 1; k <= cols; k++) {
        row[k] = k;
    }

    for (int r = 0; r < rows; r++) {
        final char current = source[r];
        // Value of the cell diagonally up-left of the one about to be written.
        int diagonal = row[0];
        row[0] = r + 1;
        for (int c = 0; c < cols; c++) {
            final int above = row[c + 1];
            final int substitute = diagonal + (current == target[c] ? 0 : 1);
            final int insertOrDelete = Math.min(above, row[c]) + 1;
            row[c + 1] = Math.min(insertOrDelete, substitute);
            // The old value of this cell is the diagonal for the next column.
            diagonal = above;
        }
    }
    return row[cols];
}
ES自定义评分脚本
## 库名称:zhCNenUS-基建化工-油气-木木有限公司23456crm-QM、zhCNenUS-汽车-汽车-木木有限公司23456crm-QM
POST /tm/_search
{
"query": {
"bool": {
"filter": [
{ "terms": { "dbId": ["101476","110316"] }},
{ "match": {
"original": {
"query": "姓名:上云测试用户01",
"minimum_should_match": "69%"
}
}}
],
"must": [
{ "function_score": {
"functions": [
{ "script_score": {
"script": {
"source": """
// Similarity score between a stored string (str) and a request parameter (str1).
// Keep this function at the top of the script.
//   10  - either argument is null or empty
//   30  - shorter/longer length ratio is below 0.7 (the edit distance is skipped)
//   100 - exact match
//   90  - match when case is ignored
//   otherwise 80 * (1 / edit distance between the lower-cased strings)
double calculate(String str, String str1) {
    // Fast exits
    if (str == null || str1 == null || str.isEmpty() || str1.isEmpty()) return 10.0;
    int len = str.length(), len1 = str1.length();
    int minLen = len < len1 ? len : len1;
    int maxLen = len < len1 ? len1 : len;
    double ratio = (double)minLen / maxLen;
    if (ratio < 0.7) {
        return 30;
    }
    // Exact match is checked before lower-casing so identical strings skip that work
    if (str.equals(str1)) {
        return 100;
    }
    String strLower = str.toLowerCase();
    String str1Lower = str1.toLowerCase();
    if (strLower.equals(str1Lower)) {
        return 90;
    }
    // Pre-processing
    char[] arr1 = strLower.toCharArray();
    char[] arr2 = str1Lower.toCharArray();
    // Lower-casing can change a string's length (for example U+0130), so the loop bounds
    // come from the arrays that are actually indexed, not from the original lengths.
    int rows = arr1.length;
    int cols = arr2.length;
    // Single-row dynamic-programming array
    int[] dp = new int[cols + 1];
    for (int j = 0; j <= cols; j++) dp[j] = j;
    for (int i = 1; i <= rows; i++) {
        int prevDiagonal = dp[0];
        dp[0] = i;
        char c1 = arr1[i - 1];
        for (int j = 1; j <= cols; j++) {
            int cost = (c1 == arr2[j - 1]) ? 0 : 1;
            // Cheapest of delete, insert and substitute
            int newVal = Math.min(Math.min(dp[j], dp[j - 1]) + 1, prevDiagonal + cost);
            prevDiagonal = dp[j];
            dp[j] = newVal;
        }
    }
    // The distance is at least 1 here: equal lower-cased strings already returned 90
    return 80.0 * (1.0 / dp[cols]);
}
// A hit may have no doc value for a keyword field. Reading .value then throws and fails
// the whole request, so check size() first and fall back to an empty string, which
// calculate() scores as 10.
String es1 = doc['original.keyword'].size() == 0 ? '' : doc['original.keyword'].value;
String es2 = doc['translation.keyword'].size() == 0 ? '' : doc['translation.keyword'].value;
// Request parameters; a missing parameter is treated as an empty string so the length
// check below cannot dereference null.
String str1 = params.val1 == null ? '' : params.val1;
String str2 = params.val2 == null ? '' : params.val2;
// Score of the plain text
double textRatio = calculate(es1, str1);
// Score of the tag format
double tagRatio = calculate(es2, str2);
// If the source has no tags but the corpus entry does, deduct points
if (str2.length() == 0 && es2.length() != 0) {
    tagRatio = -tagRatio;
}
// Combine the two scores; the tag score is weighted at one tenth
return textRatio + 0.1 * tagRatio;
""",
"params": {
"val1": "姓名:上云测试用户01",
"val2": "<1>4<2/>4</1>"
}
}
}}
],
"boost_mode": "replace"
}}
]
}
}
}