https://github.com/tdebatty/java-string-similarity 推荐使用标准的莱文斯坦(Levenshtein)算法和余弦(Cosine)算法:字符串较大时就用余弦算法,对准确率要求高时就用莱文斯坦算法。
附上一个测试类,自己修改一下,用自己的实际数据多跑跑。
/**
 * Exploratory tests that print the results and rough timings of several string-similarity
 * algorithms so they can be compared by eye.
 *
 * <p>Nothing is asserted: every test only writes to standard output. Timings are single-shot
 * measurements without warm-up, so treat them as rough indications only.
 */
public class SimilarityAlgorithmTest {

    /** Number of nanoseconds in one millisecond; timings are reported in milliseconds. */
    private static final long NANOS_PER_MILLI = 1_000_000L;

    /**
     * Returns the whole milliseconds elapsed since an earlier {@link System#nanoTime()} reading.
     *
     * <p>{@code System.nanoTime()} is used instead of {@code System.currentTimeMillis()} because
     * it is meant for measuring elapsed time and is not affected by wall-clock adjustments.
     *
     * @param startNanos value previously obtained from {@code System.nanoTime()}
     * @return elapsed time in milliseconds, truncated
     */
    private static long elapsedMillis(long startNanos) {
        return (System.nanoTime() - startNanos) / NANOS_PER_MILLI;
    }

    /**
     * Prints the Levenshtein distance for a pair of short strings: unbounded, for a string
     * against itself, with a threshold of 3, and in its detailed form.
     */
    @Test
    public void test() {
        String str1 = "ares";
        String str2 = "kele";

        LevenshteinDistance levenshtein = LevenshteinDistance.getDefaultInstance();
        System.out.println("Levenshtein distance: " + levenshtein.apply(str1, str2));
        // Same string on both sides.
        System.out.println("Levenshtein distance: " + levenshtein.apply(str2, str2));

        LevenshteinDistance levenshteinWithThreshold = new LevenshteinDistance(3);
        // Returns -1 since the actual distance, 4, is higher than the threshold
        System.out.println("Levenshtein distance: " + levenshteinWithThreshold.apply(str1, str2));

        LevenshteinDetailedDistance levenshteinDetailed = LevenshteinDetailedDistance.getDefaultInstance();
        System.out.println("Levenshtein detailed distance: " + levenshteinDetailed.apply(str1, str2));
    }

    /**
     * Compares a thresholded Levenshtein run against an unbounded run on truncated input, then
     * prints similarity scores from several algorithms for the same pair of long strings.
     *
     * <p>Both inputs are a separately generated prefix followed by the same shared suffix.
     * NOTE(review): the normalisation below assumes {@code StringUtil.random(n)} returns a
     * string of exactly {@code n} characters - confirm against {@code StringUtil}.
     */
    @Test
    public void testLevenshteinDistance() {
        int sLength = 13_162;
        int strLength = 4_000;
        int totalLength = strLength + sLength;

        String s = StringUtil.random(sLength);
        String s1 = StringUtil.random(strLength) + s;
        String s2 = StringUtil.random(strLength) + s;

        int threshold = 4_096;
        LevenshteinDistance levenshtein = new LevenshteinDistance(threshold);
        long startNanos = System.nanoTime();
        /*
         * Timings noted by the original author for the thresholded run:
         *   threshold 32766 took 7022 ms
         *   threshold 16384 took 2164 ms
         *   threshold  8192 took  463 ms
         *   threshold  4096 took  146 ms
         *   threshold  2048 took   62 ms
         *   threshold  1024 took   41 ms
         *   threshold   512 took   29 ms
         */
        Integer result = levenshtein.apply(s1, s2);
        System.out.println(elapsedMillis(startNanos));
        System.out.println("Levenshtein distance: " + result);

        /*
         * Truncation mode: compare only the leading `threshold` characters of each string,
         * with no threshold on the algorithm itself.
         * The original author noted 67 ms for threshold 4096.
         */
        levenshtein = LevenshteinDistance.getDefaultInstance();
        startNanos = System.nanoTime();
        result = levenshtein.apply(s1.substring(0, threshold), s2.substring(0, threshold));
        System.out.println(elapsedMillis(startNanos));
        System.out.println("Levenshtein distance: " + result);
        System.out.println();

        // Unbounded distance on the full strings, turned into a similarity by dividing by the
        // expected string length.
        result = levenshtein.apply(s1, s2);
        System.out.println("Levenshtein: " + (1.0 - (double) result / totalLength));

        NormalizedLevenshtein normalizedLevenshtein = new NormalizedLevenshtein();
        double similarity = normalizedLevenshtein.similarity(s1, s2);
        System.out.println("NormalizedLevenshtein: " + similarity);

        Jaccard jaccard = new Jaccard();
        similarity = jaccard.similarity(s1, s2);
        System.out.println("Jaccard: " + similarity);

        Cosine cosine = new Cosine();
        similarity = cosine.similarity(s1, s2);
        System.out.println("Cosine: " + similarity);

        // QGram only exposes a distance here; it is scaled by the expected string length the
        // same way as the raw Levenshtein distance above.
        QGram qGram = new QGram();
        double qGramDistance = qGram.distance(s1, s2);
        System.out.println("QGram: " + (1.0 - qGramDistance / totalLength));
    }

    /**
     * Prints how long, in milliseconds, each algorithm takes on one pair of random strings
     * requested with size 13,162.
     */
    @Test
    public void testAlgorithmPerformance() {
        String s1 = StringUtil.random(13_162);
        String s2 = StringUtil.random(13_162);

        LevenshteinDistance levenshtein = LevenshteinDistance.getDefaultInstance();
        long startNanos = System.nanoTime();
        levenshtein.apply(s1, s2);
        System.out.println("LevenshteinDistance:" + elapsedMillis(startNanos));

        NormalizedLevenshtein normalizedLevenshtein = new NormalizedLevenshtein();
        startNanos = System.nanoTime();
        normalizedLevenshtein.distance(s1, s2);
        System.out.println("NormalizedLevenshtein: " + elapsedMillis(startNanos));

        JaroWinkler jaroWinkler = new JaroWinkler();
        startNanos = System.nanoTime();
        jaroWinkler.distance(s1, s2);
        System.out.println("JaroWinkler: " + elapsedMillis(startNanos));

        LongestCommonSubsequence longestCommonSubsequence = new LongestCommonSubsequence();
        startNanos = System.nanoTime();
        longestCommonSubsequence.apply(s1, s2);
        System.out.println("LongestCommonSubsequence: " + elapsedMillis(startNanos));

        info.debatty.java.stringsimilarity.MetricLCS metricLCS = new info.debatty.java.stringsimilarity.MetricLCS();
        startNanos = System.nanoTime();
        metricLCS.distance(s1, s2);
        System.out.println("MetricLCS: " + elapsedMillis(startNanos));

        QGram qGram = new QGram();
        startNanos = System.nanoTime();
        qGram.distance(s1, s2);
        System.out.println("QGram: " + elapsedMillis(startNanos));

        Cosine cosine = new Cosine();
        startNanos = System.nanoTime();
        cosine.distance(s1, s2);
        System.out.println("Cosine: " + elapsedMillis(startNanos));

        Jaccard jaccard = new Jaccard();
        startNanos = System.nanoTime();
        jaccard.distance(s1, s2);
        System.out.println("Jaccard: " + elapsedMillis(startNanos));
    }

    /**
     * Prints how long, in milliseconds, the cosine distance takes on one pair of random strings
     * requested with size 1,000,000.
     */
    @Test
    public void testCosine() {
        String s1 = StringUtil.random(1_000_000);
        String s2 = StringUtil.random(1_000_000);

        Cosine cosine = new Cosine();
        long startNanos = System.nanoTime();
        cosine.distance(s1, s2);
        System.out.println("Cosine: " + elapsedMillis(startNanos));
    }
}