public class JavaUtil { // This implementation is much faster than Levenshtein method in my testing. // These variables will determine the probability of a false-positive // Set as needed private static final int BLOOM_FILTER_SIZE = 1024; private static final int NUM_HASH_FUNCTIONS = 7; // Note this function will only check the length of "match" (checking the end of "text", adjust as is needed) public static boolean meetSimilarity(String text, String match, float simPercent) { if (text == null || match == null) { throw new IllegalArgumentException("Input strings cannot be null"); } if (simPercent < 0 || simPercent > 1) { throw new IllegalArgumentException("Similarity percentage should be between 0 and 1"); } String s1 = text.trim().replaceAll("\\s+", ""); String s2 = match.trim().replaceAll("\\s+", ""); if(s1.length() > s2.length()) { s1 = s1.substring(s1.length() - s2.length()); } // Create Bloom Filters for s1 and s2 BitSet s1BloomFilter = createBloomFilter(s1); BitSet s2BloomFilter = createBloomFilter(s2); // Calculate the similarity using Jaccard index (intersection over union) BitSet intersection = (BitSet) s1BloomFilter.clone(); intersection.and(s2BloomFilter); BitSet union = (BitSet) s1BloomFilter.clone(); union.or(s2BloomFilter); float similarity = (float) intersection.cardinality() / union.cardinality(); return similarity >= simPercent; } private static BitSet createBloomFilter(String str) { BitSet bloomFilter = new BitSet(BLOOM_FILTER_SIZE); for (int i = 0; i < str.length(); i++) { for (int j = 0; j < NUM_HASH_FUNCTIONS; j++) { int hash = hash(str, i, j); bloomFilter.set(Math.abs(hash) % BLOOM_FILTER_SIZE); } } return bloomFilter; } private static int hash(String str, int i, int j) { return (str.charAt(i) * (j + 1)) % (Integer.MAX_VALUE / BLOOM_FILTER_SIZE); } }