namespace BloomFilter { using System; using System.Collections; /// /// Bloom filter. /// /// Item type public class Filter { private readonly int _hashFunctionCount; private readonly BitArray _hashBits; private readonly HashFunction _getHashSecondary; /// /// Creates a new Bloom filter, specifying an error rate of 1/capacity, using the optimal size for the underlying data structure based on the desired capacity and error rate, as well as the optimal number of hash functions. /// A secondary hash function will be provided for you if your type T is either string or int. Otherwise an exception will be thrown. If you are not using these types please use the overload that supports custom hash functions. /// /// The anticipated number of items to be added to the filter. More than this number of items can be added, but the error rate will exceed what is expected. public Filter(int capacity) : this(capacity, null) { } /// /// Creates a new Bloom filter, using the optimal size for the underlying data structure based on the desired capacity and error rate, as well as the optimal number of hash functions. /// A secondary hash function will be provided for you if your type T is either string or int. Otherwise an exception will be thrown. If you are not using these types please use the overload that supports custom hash functions. /// /// The anticipated number of items to be added to the filter. More than this number of items can be added, but the error rate will exceed what is expected. /// The accepable false-positive rate (e.g., 0.01F = 1%) public Filter(int capacity, float errorRate) : this(capacity, errorRate, null) { } /// /// Creates a new Bloom filter, specifying an error rate of 1/capacity, using the optimal size for the underlying data structure based on the desired capacity and error rate, as well as the optimal number of hash functions. /// /// The anticipated number of items to be added to the filter. More than this number of items can be added, but the error rate will exceed what is expected. /// The function to hash the input values. Do not use GetHashCode(). If it is null, and T is string or int a hash function will be provided for you. public Filter(int capacity, HashFunction hashFunction) : this(capacity, BestErrorRate(capacity), hashFunction) { } /// /// Creates a new Bloom filter, using the optimal size for the underlying data structure based on the desired capacity and error rate, as well as the optimal number of hash functions. /// /// The anticipated number of items to be added to the filter. More than this number of items can be added, but the error rate will exceed what is expected. /// The accepable false-positive rate (e.g., 0.01F = 1%) /// The function to hash the input values. Do not use GetHashCode(). If it is null, and T is string or int a hash function will be provided for you. public Filter(int capacity, float errorRate, HashFunction hashFunction) : this(capacity, errorRate, hashFunction, BestM(capacity, errorRate), BestK(capacity, errorRate)) { } /// /// Creates a new Bloom filter. /// /// The anticipated number of items to be added to the filter. More than this number of items can be added, but the error rate will exceed what is expected. /// The accepable false-positive rate (e.g., 0.01F = 1%) /// The function to hash the input values. Do not use GetHashCode(). If it is null, and T is string or int a hash function will be provided for you. /// The number of elements in the BitArray. /// The number of hash functions to use. public Filter(int capacity, float errorRate, HashFunction hashFunction, int m, int k) { // validate the params are in range if (capacity < 1) { throw new ArgumentOutOfRangeException("capacity", capacity, "capacity must be > 0"); } if (errorRate >= 1 || errorRate <= 0) { throw new ArgumentOutOfRangeException("errorRate", errorRate, string.Format("errorRate must be between 0 and 1, exclusive. Was {0}", errorRate)); } // from overflow in bestM calculation if (m < 1) { throw new ArgumentOutOfRangeException(string.Format("The provided capacity and errorRate values would result in an array of length > int.MaxValue. Please reduce either of these values. Capacity: {0}, Error rate: {1}", capacity, errorRate)); } // set the secondary hash function if (hashFunction == null) { if (typeof(T) == typeof(string)) { this._getHashSecondary = HashString; } else if (typeof(T) == typeof(int)) { this._getHashSecondary = HashInt32; } else { throw new ArgumentNullException("hashFunction", "Please provide a hash function for your type T, when T is not a string or int."); } } else { this._getHashSecondary = hashFunction; } this._hashFunctionCount = k; this._hashBits = new BitArray(m); } /// /// A function that can be used to hash input. /// /// The values to be hashed. /// The resulting hash code. public delegate int HashFunction(T input); /// /// The ratio of false to true bits in the filter. E.g., 1 true bit in a 10 bit filter means a truthiness of 0.1. /// public double Truthiness { get { return (double)this.TrueBits() / this._hashBits.Count; } } /// /// Adds a new item to the filter. It cannot be removed. /// /// The item. public void Add(T item) { // start flipping bits for each hash of item int primaryHash = item.GetHashCode(); int secondaryHash = this._getHashSecondary(item); for (int i = 0; i < this._hashFunctionCount; i++) { int hash = this.ComputeHash(primaryHash, secondaryHash, i); this._hashBits[hash] = true; } } /// /// Checks for the existance of the item in the filter for a given probability. /// /// The item. /// The . public bool Contains(T item) { int primaryHash = item.GetHashCode(); int secondaryHash = this._getHashSecondary(item); for (int i = 0; i < this._hashFunctionCount; i++) { int hash = this.ComputeHash(primaryHash, secondaryHash, i); if (this._hashBits[hash] == false) { return false; } } return true; } /// /// The best k. /// /// The capacity. /// The error rate. /// The . private static int BestK(int capacity, float errorRate) { return (int)Math.Round(Math.Log(2.0) * BestM(capacity, errorRate) / capacity); } /// /// The best m. /// /// The capacity. /// The error rate. /// The . private static int BestM(int capacity, float errorRate) { return (int)Math.Ceiling(capacity * Math.Log(errorRate, (1.0 / Math.Pow(2, Math.Log(2.0))))); } /// /// The best error rate. /// /// The capacity. /// The . private static float BestErrorRate(int capacity) { float c = (float)(1.0 / capacity); if (c != 0) { return c; } // default // http://www.cs.princeton.edu/courses/archive/spring02/cs493/lec7.pdf return (float)Math.Pow(0.6185, int.MaxValue / capacity); } /// /// Hashes a 32-bit signed int using Thomas Wang's method v3.1 (http://www.concentric.net/~Ttwang/tech/inthash.htm). /// Runtime is suggested to be 11 cycles. /// /// The integer to hash. /// The hashed result. private static int HashInt32(T input) { uint? x = input as uint?; unchecked { x = ~x + (x << 15); // x = (x << 15) - x- 1, as (~x) + y is equivalent to y - x - 1 in two's complement representation x = x ^ (x >> 12); x = x + (x << 2); x = x ^ (x >> 4); x = x * 2057; // x = (x + (x << 3)) + (x<< 11); x = x ^ (x >> 16); return (int)x; } } /// /// Hashes a string using Bob Jenkin's "One At A Time" method from Dr. Dobbs (http://burtleburtle.net/bob/hash/doobs.html). /// Runtime is suggested to be 9x+9, where x = input.Length. /// /// The string to hash. /// The hashed result. private static int HashString(T input) { string s = input as string; int hash = 0; for (int i = 0; i < s.Length; i++) { hash += s[i]; hash += (hash << 10); hash ^= (hash >> 6); } hash += (hash << 3); hash ^= (hash >> 11); hash += (hash << 15); return hash; } /// /// The true bits. /// /// The . private int TrueBits() { int output = 0; foreach (bool bit in this._hashBits) { if (bit == true) { output++; } } return output; } /// /// Performs Dillinger and Manolios double hashing. /// /// The primary hash. /// The secondary hash. /// The i. /// The . private int ComputeHash(int primaryHash, int secondaryHash, int i) { int resultingHash = (primaryHash + (i * secondaryHash)) % this._hashBits.Count; return Math.Abs((int)resultingHash); } } }