Last active
April 16, 2024 18:48
-
-
Save akumaburn/14c37bf8ed1750bebf313505bad018e3 to your computer and use it in GitHub Desktop.
Bloomfilters for string similarity comparison, much faster than Damerau Levenshtein method
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| public class JavaUtil { | |
| // This implementation is much faster than Levenshtein method in my testing. | |
| // These variables will determine the probability of a false-positive | |
| // Set as needed | |
| private static final int BLOOM_FILTER_SIZE = 1024; | |
| private static final int NUM_HASH_FUNCTIONS = 7; | |
| // Note this function will only check the length of "match" (checking the end of "text", adjust as is needed) | |
| public static boolean meetSimilarity(String text, String match, float simPercent) { | |
| if (text == null || match == null) { | |
| throw new IllegalArgumentException("Input strings cannot be null"); | |
| } | |
| if (simPercent < 0 || simPercent > 1) { | |
| throw new IllegalArgumentException("Similarity percentage should be between 0 and 1"); | |
| } | |
| String s1 = text.trim().replaceAll("\\s+", ""); | |
| String s2 = match.trim().replaceAll("\\s+", ""); | |
| if(s1.length() > s2.length()) { | |
| s1 = s1.substring(s1.length() - s2.length()); | |
| } | |
| // Create Bloom Filters for s1 and s2 | |
| BitSet s1BloomFilter = createBloomFilter(s1); | |
| BitSet s2BloomFilter = createBloomFilter(s2); | |
| // Calculate the similarity using Jaccard index (intersection over union) | |
| BitSet intersection = (BitSet) s1BloomFilter.clone(); | |
| intersection.and(s2BloomFilter); | |
| BitSet union = (BitSet) s1BloomFilter.clone(); | |
| union.or(s2BloomFilter); | |
| float similarity = (float) intersection.cardinality() / union.cardinality(); | |
| return similarity >= simPercent; | |
| } | |
| private static BitSet createBloomFilter(String str) { | |
| BitSet bloomFilter = new BitSet(BLOOM_FILTER_SIZE); | |
| for (int i = 0; i < str.length(); i++) { | |
| for (int j = 0; j < NUM_HASH_FUNCTIONS; j++) { | |
| int hash = hash(str, i, j); | |
| bloomFilter.set(Math.abs(hash) % BLOOM_FILTER_SIZE); | |
| } | |
| } | |
| return bloomFilter; | |
| } | |
| private static int hash(String str, int i, int j) { | |
| return (str.charAt(i) * (j + 1)) % (Integer.MAX_VALUE / BLOOM_FILTER_SIZE); | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment