Skip to content

Instantly share code, notes, and snippets.

@abhishektiwari
Forked from josephkern/bloom.py
Created April 17, 2017 11:39
Show Gist options
  • Save abhishektiwari/de1b0a38d1a66cb0eb5dfa94033730b8 to your computer and use it in GitHub Desktop.
Save abhishektiwari/de1b0a38d1a66cb0eb5dfa94033730b8 to your computer and use it in GitHub Desktop.

Revisions

  1. @josephkern josephkern revised this gist Jun 8, 2012. 1 changed file with 4 additions and 4 deletions.
    8 changes: 4 additions & 4 deletions bloom.py
    Original file line number Diff line number Diff line change
    @@ -54,18 +54,18 @@ def add(self, value):
    # by the (digest / 8) digest is described above in self._hash()
    # Bitwise OR is undertaken on the value at the location and
    # 2 to the power of digest modulo 8. Ex: 2 ** (30034 % 8)
    # to garuntee the value is <= 128, the bytearray not being able
    # to grantee the value is <= 128, the bytearray not being able
    # to store a value >= 256. Q: Why not use ((modulo 9) -1) then?
    self.filter[(digest / 8)] |= (2 ** (digest % 8))
    # The purpose here is to spread out the hashes to create a uniqe
    # "fingerprint" with uniqe locations in the filter array,
    # The purpose here is to spread out the hashes to create a unique
    # "fingerprint" with unique locations in the filter array,
    # rather than just a big long hash blob.

    def query(self, value):
    """Bitwise AND to query values in self.filter
    Expects:
    value: value to check filter against (assumed int())"""
    # If all() hashes return True from a btiwise AND (the opposite
    # If all() hashes return True from a bitwise AND (the opposite
    # described above in self.add()) for each digest returned from
    # self._hash return True, else False
    return all(self.filter[(digest / 8)] & (2 ** (digest % 8))
  2. @josephkern josephkern created this gist Jun 8, 2012.
    85 changes: 85 additions & 0 deletions bloom.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,85 @@
    #!/usr/bin/env python
    """A simple Bloom Filter implementation
    Calculating optimal filter size:
    Where:
    m is: self.bitcount (how many bits in self.filter)
    n is: the number of expected values added to self.filter
    k is: the number of hashes being produced
    (1 - math.exp(-float(k * n) / m)) ** k
    http://en.wikipedia.org/wiki/Bloom_filter
    """

    from hashlib import sha256

    class BloomFilter(object):
    """A simple bloom filter for lots of int()"""

    def __init__(self, array_size=(1 * 1024), hashes=13):
    """Initializes a BloomFilter() object:
    Expects:
    array_size (in bytes): 4 * 1024 for a 4KB filter
    hashes (int): for the number of hashes to perform"""

    self.filter = bytearray(array_size) # The filter itself
    self.bitcount = array_size * 8 # Bits in the filter
    self.hashes = hashes # The number of hashes to use

    def _hash(self, value):
    """Creates a hash of an int and yields a generator of hash functions
    Expects:
    value: int()
    Yields:
    generator of ints()"""

    # Build an int() around the sha256 digest of int() -> value
    value = value.__str__() # Comment out line if you're filtering strings()
    digest = int(sha256(value).hexdigest(), 16)
    for _ in range(self.hashes):
    # bitwise AND of the digest and all of the available bit positions
    # in the filter
    yield digest & (self.bitcount - 1)
    # Shift bits in digest to the right, based on 256 (in sha256)
    # divided by the number of hashes needed be produced.
    # Rounding the result by using int().
    # So: digest >>= (256 / 13) would shift 19 bits to the right.
    digest >>= (256 / self.hashes)

    def add(self, value):
    """Bitwise OR to add value(s) into the self.filter
    Expects:
    value: generator of digest ints()
    """
    for digest in self._hash(value):
    # In-place bitwise OR of the filter, position is determined
    # by the (digest / 8) digest is described above in self._hash()
    # Bitwise OR is undertaken on the value at the location and
    # 2 to the power of digest modulo 8. Ex: 2 ** (30034 % 8)
    # to garuntee the value is <= 128, the bytearray not being able
    # to store a value >= 256. Q: Why not use ((modulo 9) -1) then?
    self.filter[(digest / 8)] |= (2 ** (digest % 8))
    # The purpose here is to spread out the hashes to create a uniqe
    # "fingerprint" with uniqe locations in the filter array,
    # rather than just a big long hash blob.

    def query(self, value):
    """Bitwise AND to query values in self.filter
    Expects:
    value: value to check filter against (assumed int())"""
    # If all() hashes return True from a btiwise AND (the opposite
    # described above in self.add()) for each digest returned from
    # self._hash return True, else False
    return all(self.filter[(digest / 8)] & (2 ** (digest % 8))
    for digest in self._hash(value))


    if __name__ == "__main__":
    bf = BloomFilter()

    bf.add(30000)
    bf.add(1230213)
    bf.add(1)

    print("Filter size {0} bytes").format(bf.filter.__sizeof__())
    print bf.query(1) # True
    print bf.query(1230213) # True
    print bf.query(12) # False