Skip to content

Instantly share code, notes, and snippets.

@akumaburn
Last active April 16, 2024 18:48
Show Gist options
  • Select an option

  • Save akumaburn/14c37bf8ed1750bebf313505bad018e3 to your computer and use it in GitHub Desktop.

Select an option

Save akumaburn/14c37bf8ed1750bebf313505bad018e3 to your computer and use it in GitHub Desktop.
Bloomfilters for string similarity comparison, much faster than Damerau Levenshtein method
public class JavaUtil {
// This implementation is much faster than Levenshtein method in my testing.
// These variables will determine the probability of a false-positive
// Set as needed
private static final int BLOOM_FILTER_SIZE = 1024;
private static final int NUM_HASH_FUNCTIONS = 7;
// Note this function will only check the length of "match" (checking the end of "text", adjust as is needed)
public static boolean meetSimilarity(String text, String match, float simPercent) {
if (text == null || match == null) {
throw new IllegalArgumentException("Input strings cannot be null");
}
if (simPercent < 0 || simPercent > 1) {
throw new IllegalArgumentException("Similarity percentage should be between 0 and 1");
}
String s1 = text.trim().replaceAll("\\s+", "");
String s2 = match.trim().replaceAll("\\s+", "");
if(s1.length() > s2.length()) {
s1 = s1.substring(s1.length() - s2.length());
}
// Create Bloom Filters for s1 and s2
BitSet s1BloomFilter = createBloomFilter(s1);
BitSet s2BloomFilter = createBloomFilter(s2);
// Calculate the similarity using Jaccard index (intersection over union)
BitSet intersection = (BitSet) s1BloomFilter.clone();
intersection.and(s2BloomFilter);
BitSet union = (BitSet) s1BloomFilter.clone();
union.or(s2BloomFilter);
float similarity = (float) intersection.cardinality() / union.cardinality();
return similarity >= simPercent;
}
private static BitSet createBloomFilter(String str) {
BitSet bloomFilter = new BitSet(BLOOM_FILTER_SIZE);
for (int i = 0; i < str.length(); i++) {
for (int j = 0; j < NUM_HASH_FUNCTIONS; j++) {
int hash = hash(str, i, j);
bloomFilter.set(Math.abs(hash) % BLOOM_FILTER_SIZE);
}
}
return bloomFilter;
}
private static int hash(String str, int i, int j) {
return (str.charAt(i) * (j + 1)) % (Integer.MAX_VALUE / BLOOM_FILTER_SIZE);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment