发现apache提供了现成的解决方案


java中如何比较两个字符串(java中如何计算两个字符串的相似度)(1)



1.Cosine similarity

package org.apache.commons.text.similarity; import java.util.HashSet; import java.util.Map; import java.util.Set; /** * Measures the Cosine similarity of two vectors of an inner product space and * compares the angle between them. * * <p> * For further explanation about the Cosine Similarity, refer to * http://en.wikipedia.org/wiki/Cosine_similarity. * </p> * * @since 1.0 */ public class CosineSimilarity { /** * Calculates the cosine similarity for two given vectors. * * @param leftVector left vector * @param rightVector right vector * @return cosine similarity between the two vectors */ public Double cosineSimilarity(final Map<CharSequence, Integer> leftVector, final Map<CharSequence, Integer> rightVector) { if (leftVector == null || rightVector == null) { throw new IllegalArgumentException("Vectors must not be null"); } final Set<CharSequence> intersection = getIntersection(leftVector, rightVector); final double dotProduct = dot(leftVector, rightVector, intersection); double d1 = 0.0d; for (final Integer value : leftVector.values()) { d1 = Math.pow(value, 2); } double d2 = 0.0d; for (final Integer value : rightVector.values()) { d2 = Math.pow(value, 2); } double cosineSimilarity; if (d1 <= 0.0 || d2 <= 0.0) { cosineSimilarity = 0.0; } else { cosineSimilarity = (double) (dotProduct / (double) (Math.sqrt(d1) * Math.sqrt(d2))); } return cosineSimilarity; } /** * Returns a set with strings common to the two given maps. * * @param leftVector left vector map * @param rightVector right vector map * @return common strings */ private Set<CharSequence> getIntersection(final Map<CharSequence, Integer> leftVector, final Map<CharSequence, Integer> rightVector) { final Set<CharSequence> intersection = new HashSet<>(leftVector.keySet()); intersection.retainAll(rightVector.keySet()); return intersection; } /** * Computes the dot product of two vectors. It ignores remaining elements. It means * that if a vector is longer than other, then a smaller part of it will be used to compute * the dot product. * * @param leftVector left vector * @param rightVector right vector * @param intersection common elements * @return the dot product */ private double dot(final Map<CharSequence, Integer> leftVector, final Map<CharSequence, Integer> rightVector, final Set<CharSequence> intersection) { long dotProduct = 0; for (final CharSequence key : intersection) { dotProduct = leftVector.get(key) * rightVector.get(key); } return dotProduct; } }

2.JaccardSimilarity

package org.apache.commons.text.similarity; import java.util.HashSet; import java.util.Set; /** * Measures the Jaccard similarity (aka Jaccard index) of two sets of character * sequence. Jaccard similarity is the size of the intersection divided by the * size of the union of the two sets. * * <p> * For further explanation about Jaccard Similarity, refer * https://en.wikipedia.org/wiki/Jaccard_index * </p> * * @since 1.0 */ public class JaccardSimilarity implements SimilarityScore<Double> { /** * Calculates Jaccard Similarity of two set character sequence passed as * input. * * @param left first character sequence * @param right second character sequence * @return index * @throws IllegalArgumentException * if either String input {@code null} */ @Override public Double apply(CharSequence left, CharSequence right) { if (left == null || right == null) { throw new IllegalArgumentException("Input cannot be null"); } return Math.round(calculateJaccardSimilarity(left, right) * 100d) / 100d; } /** * Calculates Jaccard Similarity of two character sequences passed as * input. Does the calculation by identifying the union (characters in at * least one of the two sets) of the two sets and intersection (characters * which are present in set one which are present in set two) * * @param left first character sequence * @param right second character sequence * @return index */ private Double calculateJaccardSimilarity(CharSequence left, CharSequence right) { Set<String> intersectionSet = new HashSet<String>(); Set<String> unionSet = new HashSet<String>(); boolean unionFilled = false; int leftLength = left.length(); int rightLength = right.length(); if (leftLength == 0 || rightLength == 0) { return 0d; } for (int leftIndex = 0; leftIndex < leftLength; leftIndex ) { unionSet.add(String.valueOf(left.charAt(leftIndex))); for (int rightIndex = 0; rightIndex < rightLength; rightIndex ) { if (!unionFilled) { unionSet.add(String.valueOf(right.charAt(rightIndex))); } if (left.charAt(leftIndex) == right.charAt(rightIndex)) { intersectionSet.add(String.valueOf(left.charAt(leftIndex))); } } unionFilled = true; } return Double.valueOf(intersectionSet.size()) / Double.valueOf(unionSet.size()); } }

3.LevenshteinDistance

/** * LevenshteinDistance * copied from https://commons.apache.org/proper/commons-lang/javadocs/api-2.5/src-html/org/apache/commons/lang/StringUtils.html#line.6162 */ public static int getLevenshteinDistance(String s, String t) { if (s == null || t == null) { throw new IllegalArgumentException("Strings must not be null"); } int n = s.length(); // length of s int m = t.length(); // length of t if (n == 0) { return m; } else if (m == 0) { return n; } if (n > m) { // swap the input strings to consume less memory String tmp = s; s = t; t = tmp; n = m; m = t.length(); } int p[] = new int[n 1]; //'previous' cost array, horizontally int d[] = new int[n 1]; // cost array, horizontally int _d[]; //placeholder to assist in swapping p and d // indexes into strings s and t int i; // iterates through s int j; // iterates through t char t_j; // jth character of t int cost; // cost for (i = 0; i <= n; i ) { p[i] = i; } for (j = 1; j <= m; j ) { t_j = t.charAt(j - 1); d[0] = j; for (i = 1; i <= n; i ) { cost = s.charAt(i - 1) == t_j ? 0 : 1; // minimum of cell to the left 1, to the top 1, diagonally left and up cost d[i] = Math.min(Math.min(d[i - 1] 1, p[i] 1), p[i - 1] cost); } // copy current distance counts to 'previous row' distance counts _d = p; p = d; d = _d; } // our last action in the above loop was to switch d and p, so p now // actually has the most recent cost counts return p[n]; }

4.JaroWinklerDistance

/** * JaroWinklerDistance * Copied from https://commons.apache.org/sandbox/commons-text/jacoco/org.apache.commons.text.similarity/JaroWinklerDistance.java.html * apply method changed to similarity */ public static Double similarity(final CharSequence left, final CharSequence right) { final double defaultScalingFactor = 0.1; final double percentageRoundValue = 100.0; if (left == null || right == null) { throw new IllegalArgumentException("Strings must not be null"); } int[] mtp = matches(left, right); double m = mtp[0]; if (m == 0) { return 0D; } double j = ((m / left.length() m / right.length() (m - mtp[1]) / m)) / 3; double jw = j < 0.7D ? j : j Math.min(defaultScalingFactor, 1D / mtp[3]) * mtp[2] * (1D - j); return Math.round(jw * percentageRoundValue) / percentageRoundValue; } protected static int[] matches(final CharSequence first, final CharSequence second) { CharSequence max, min; if (first.length() > second.length()) { max = first; min = second; } else { max = second; min = first; } int range = Math.max(max.length() / 2 - 1, 0); int[] matchIndexes = new int[min.length()]; Arrays.fill(matchIndexes, -1); boolean[] matchFlags = new boolean[max.length()]; int matches = 0; for (int mi = 0; mi < min.length(); mi ) { char c1 = min.charAt(mi); for (int xi = Math.max(mi - range, 0), xn = Math.min(mi range 1, max.length()); xi < xn; xi ) { if (!matchFlags[xi] && c1 == max.charAt(xi)) { matchIndexes[mi] = xi; matchFlags[xi] = true; matches ; break; } } } char[] ms1 = new char[matches]; char[] ms2 = new char[matches]; for (int i = 0, si = 0; i < min.length(); i ) { if (matchIndexes[i] != -1) { ms1[si] = min.charAt(i); si ; } } for (int i = 0, si = 0; i < max.length(); i ) { if (matchFlags[i]) { ms2[si] = max.charAt(i); si ; } } int transpositions = 0; for (int mi = 0; mi < ms1.length; mi ) { if (ms1[mi] != ms2[mi]) { transpositions ; } } int prefix = 0; for (int mi = 0; mi < min.length(); mi ) { if (first.charAt(mi) == second.charAt(mi)) { prefix ; } else { break; } } return new int[] { matches, transpositions / 2, prefix, max.length() }; }





参考资料:

【1】https://stackoverflow.com/questions/955110/similarity-string-comparison-in-java

【2】https://zatackcoder.com/java-program-to-check-two-strings-similarity/

【3】https://commons.apache.org/sandbox/commons-text/jacoco/org.apache.commons.text.similarity/index.source.html

,