Skip to content

Instantly share code, notes, and snippets.

@mburst
Created February 3, 2013 05:26
Show Gist options
  • Save mburst/4700640 to your computer and use it in GitHub Desktop.
Save mburst/4700640 to your computer and use it in GitHub Desktop.

Revisions

  1. mburst created this gist Feb 3, 2013.
    63 changes: 63 additions & 0 deletions bloomfilter.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,63 @@
    from bitarray import bitarray
    import mmh3

    class BloomFilter:

    def __init__(self, size, hash_count):
    self.size = size
    self.hash_count = hash_count
    self.bit_array = bitarray(size)
    self.bit_array.setall(0)

    def add(self, string):
    for seed in xrange(self.hash_count):
    result = mmh3.hash(string, seed) % self.size
    self.bit_array[result] = 1

    def lookup(self, string):
    for seed in xrange(self.hash_count):
    result = mmh3.hash(string, seed) % self.size
    if self.bit_array[result] == 0:
    return "Nope"
    return "Probably"

    bf = BloomFilter(500000, 7)
    huge = []

    lines = open("/usr/share/dict/american-english").read().splitlines()
    for line in lines:
    bf.add(line)
    huge.append(line)

    import datetime

    start = datetime.datetime.now()
    bf.lookup("google")
    finish = datetime.datetime.now()
    print (finish-start).microseconds

    start = datetime.datetime.now()
    for word in huge:
    if word == "google":
    break
    finish = datetime.datetime.now()
    print (finish-start).microseconds


    print bf.lookup("Max")
    print bf.lookup("mice")
    print bf.lookup("3")


    start = datetime.datetime.now()
    bf.lookup("apple")
    finish = datetime.datetime.now()
    print (finish-start).microseconds


    start = datetime.datetime.now()
    for word in huge:
    if word == "apple":
    break
    finish = datetime.datetime.now()
    print (finish-start).microseconds