Skip to content

Instantly share code, notes, and snippets.

@hagen3000
Forked from marcan/bloom.py
Created January 22, 2019 15:07
Show Gist options
  • Save hagen3000/b799253b11c38638d5c834ce0dc35edc to your computer and use it in GitHub Desktop.
Save hagen3000/b799253b11c38638d5c834ce0dc35edc to your computer and use it in GitHub Desktop.

Revisions

  1. @marcan marcan revised this gist Mar 2, 2018. 1 changed file with 10 additions and 7 deletions.
    17 changes: 10 additions & 7 deletions bloom.py
    Original file line number Diff line number Diff line change
    @@ -7,10 +7,10 @@
    # Written to be used with the Have I been pwned? password list:
    # https://haveibeenpwned.com/passwords
    #
    # Download the pre-computed filter here (629MB, k=11, false positive p=0.0005):
    # https://mrcn.st/t/pwned-passwords-1.0u2.bloom
    # Or if you're more paranoid (924MB, k=16, false positive p=0.000015):
    # https://mrcn.st/t/pwned-passwords-1.0u2-k16.bloom
    # Download the pre-computed filter here (968MB, k=11, false positive p=0.0005):
    # https://mrcn.st/t/pwned-passwords-2.0.bloom
    # Or if you're more paranoid (1.41GB, k=16, false positive p=0.000015):
    # https://mrcn.st/t/pwned-passwords-2.0-k16.bloom
    #
    # Also works as a generic file-backed bloom filter for other purposes.
    # Use something like this to work out what 'm' and 'k' you need:
    @@ -40,7 +40,7 @@

    class BloomFilter(object):
    ALIGN = 16384
    THRESHOLD = 2**32 # for uniformity, rehash after fewer bits left
    THRESHOLD_HEADROOM = 2**16 # for uniformity, rehash after fewer bits left

    def __init__(self, filename, m=None, k=None, readonly=False):
    self.bits = None
    @@ -74,13 +74,16 @@ def create(self, filename, m, k):
    self.k = k
    self.size = size
    self.offset = self.ALIGN
    self.threshold = self.THRESHOLD
    self.threshold = 1
    while self.threshold < self.m:
    self.threshold *= 2
    self.threshold *= self.THRESHOLD_HEADROOM
    hdr = {
    "m": m,
    "k": k,
    "offset": self.ALIGN,
    "size": size,
    "threshold": self.THRESHOLD,
    "threshold": self.threshold,
    }
    fd.write(json.dumps(hdr).encode("ascii") + b"\n")
    fd.seek(size + self.ALIGN - 1)
  2. @marcan marcan revised this gist Aug 8, 2017. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions bloom.py
    Original file line number Diff line number Diff line change
    @@ -8,9 +8,9 @@
    # https://haveibeenpwned.com/passwords
    #
    # Download the pre-computed filter here (629MB, k=11, false positive p=0.0005):
    # https://mrcn.st/t/pwned-passwords-1.0u1.bloom
    # https://mrcn.st/t/pwned-passwords-1.0u2.bloom
    # Or if you're more paranoid (924MB, k=16, false positive p=0.000015):
    # https://mrcn.st/t/pwned-passwords-1.0u1-k16.bloom
    # https://mrcn.st/t/pwned-passwords-1.0u2-k16.bloom
    #
    # Also works as a generic file-backed bloom filter for other purposes.
    # Use something like this to work out what 'm' and 'k' you need:
  3. @marcan marcan revised this gist Aug 5, 2017. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions bloom.py
    Original file line number Diff line number Diff line change
    @@ -9,6 +9,8 @@
    #
    # Download the pre-computed filter here (629MB, k=11, false positive p=0.0005):
    # https://mrcn.st/t/pwned-passwords-1.0u1.bloom
    # Or if you're more paranoid (924MB, k=16, false positive p=0.000015):
    # https://mrcn.st/t/pwned-passwords-1.0u1-k16.bloom
    #
    # Also works as a generic file-backed bloom filter for other purposes.
    # Use something like this to work out what 'm' and 'k' you need:
  4. @marcan marcan revised this gist Aug 4, 2017. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions bloom.py
    Original file line number Diff line number Diff line change
    @@ -13,6 +13,8 @@
    # Also works as a generic file-backed bloom filter for other purposes.
    # Use something like this to work out what 'm' and 'k' you need:
    # https://krisives.github.io/bloom-calculator/
    # For bulk data loads you should put the bloom filter on a tmpfs/ramdisk,
    # as loading directly onto a disk-backed filter is extremely slow.
    #
    # Examples:
    # $ python bloom.py load -m 5033164800 -k 11 -l passwords.bloom pwned-passwords-1.0.txt
  5. @marcan marcan revised this gist Aug 4, 2017. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion bloom.py
    Original file line number Diff line number Diff line change
    @@ -17,7 +17,7 @@
    # Examples:
    # $ python bloom.py load -m 5033164800 -k 11 -l passwords.bloom pwned-passwords-1.0.txt
    # $ python bloom.py load -l passwords.bloom pwned-passwords-update-1.txt
    # $ python bloom.py test -s passwords.bloom letmein
    # $ python bloom.py test -s passwords.bloom letmein
    # Found
    # $ python bloom.py test -s passwords.bloom si2v8jX4LG
    # Not found
  6. @marcan marcan revised this gist Aug 4, 2017. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions bloom.py
    Original file line number Diff line number Diff line change
    @@ -17,9 +17,9 @@
    # Examples:
    # $ python bloom.py load -m 5033164800 -k 11 -l passwords.bloom pwned-passwords-1.0.txt
    # $ python bloom.py load -l passwords.bloom pwned-passwords-update-1.txt
    # $ python bloom.py test -s pwned-passwords-1.0u1.bloom letmein
    # $ python bloom.py test -s passwords.bloom letmein
    # Found
    # $ python bloom.py test -s pwned-passwords-1.0u1.bloom si2v8jX4LG
    # $ python bloom.py test -s passwords.bloom si2v8jX4LG
    # Not found
    #
    # $ python3
  7. @marcan marcan revised this gist Aug 4, 2017. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions bloom.py
    Original file line number Diff line number Diff line change
    @@ -11,6 +11,8 @@
    # https://mrcn.st/t/pwned-passwords-1.0u1.bloom
    #
    # Also works as a generic file-backed bloom filter for other purposes.
    # Use something like this to work out what 'm' and 'k' you need:
    # https://krisives.github.io/bloom-calculator/
    #
    # Examples:
    # $ python bloom.py load -m 5033164800 -k 11 -l passwords.bloom pwned-passwords-1.0.txt
  8. @marcan marcan created this gist Aug 4, 2017.
    191 changes: 191 additions & 0 deletions bloom.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,191 @@
    #!/usr/bin/python3
    #
    # Simple Bloom filter implementation in Python 3
    # Copyright 2017 Hector Martin "marcan" <[email protected]>
    # Licensed under the terms of the MIT license
    #
    # Written to be used with the Have I been pwned? password list:
    # https://haveibeenpwned.com/passwords
    #
    # Download the pre-computed filter here (629MB, k=11, false positive p=0.0005):
    # https://mrcn.st/t/pwned-passwords-1.0u1.bloom
    #
    # Also works as a generic file-backed bloom filter for other purposes.
    #
    # Examples:
    # $ python bloom.py load -m 5033164800 -k 11 -l passwords.bloom pwned-passwords-1.0.txt
    # $ python bloom.py load -l passwords.bloom pwned-passwords-update-1.txt
    # $ python bloom.py test -s pwned-passwords-1.0u1.bloom letmein
    # Found
    # $ python bloom.py test -s pwned-passwords-1.0u1.bloom si2v8jX4LG
    # Not found
    #
    # $ python3
    # >>> from hashlib import sha1
    # >>> from bloom import BloomFilter
    # >>> filter = bloom.BloomFilter("pwned-passwords-1.0u1.bloom")
    # >>> filter.contains(sha1(b"p4ssword").hexdigest())
    # True
    # >>> filter.contains(sha1(b"super_secure_password").hexdigest())
    # False

    import os, json, mmap
    from hashlib import md5

    class BloomFilter(object):
    ALIGN = 16384
    THRESHOLD = 2**32 # for uniformity, rehash after fewer bits left

    def __init__(self, filename, m=None, k=None, readonly=False):
    self.bits = None

    if os.path.exists(filename):
    self.open(filename, readonly=readonly)
    elif readonly:
    raise IOError("File %s not found" % filename)
    elif m is None or k is None:
    raise ValueError("Filter does not exist and m/k not provided")
    else:
    self.create(filename, m, k)

    def open(self, filename, readonly=True):
    fd = open(filename, "rb" if readonly else "r+b")
    hdr = json.loads(fd.readline().decode("ascii"))
    self.m = hdr["m"]
    self.k = hdr["k"]
    self.size = hdr["size"]
    self.offset = hdr["offset"]
    self.threshold = hdr["threshold"]
    self.bits = mmap.mmap(fd.fileno(), self.size, offset=self.offset,
    prot=(mmap.PROT_READ |
    (mmap.PROT_WRITE if not readonly else 0)))

    def create(self, filename, m, k):
    fd = open(filename, "w+b")
    size = (m + 7) // 8
    size = (size + self.ALIGN - 1) & ~(self.ALIGN - 1)
    self.m = m
    self.k = k
    self.size = size
    self.offset = self.ALIGN
    self.threshold = self.THRESHOLD
    hdr = {
    "m": m,
    "k": k,
    "offset": self.ALIGN,
    "size": size,
    "threshold": self.THRESHOLD,
    }
    fd.write(json.dumps(hdr).encode("ascii") + b"\n")
    fd.seek(size + self.ALIGN - 1)
    fd.write(b"\x00")
    fd.seek(self.ALIGN)

    self.bits = mmap.mmap(fd.fileno(), self.size, offset=self.offset)

    def hash(self, s):
    capacity = 0
    val = 0
    if isinstance(s, str):
    s = s.encode("utf-8")

    for i in range(self.k):
    if capacity < self.threshold:
    s = md5(s).digest()
    val = int.from_bytes(s, byteorder='big')
    capacity = 1 << 128
    h = val % self.m
    val //= self.m
    capacity //= self.m
    yield h

    def add(self, s):
    for h in self.hash(s):
    byte, bit = h >> 3, h & 7
    self.bits[byte] |= 1 << bit

    def update(self, iterable):
    for s in iterable:
    for h in self.hash(s):
    byte, bit = h >> 3, h & 7
    self.bits[byte] |= 1 << bit

    def contains(self, s):
    for h in self.hash(s):
    byte, bit = h >> 3, h & 7
    if not (self.bits[byte] & (1 << bit)):
    return False
    return True

    def sync(self):
    self.bits.flush()

    def __del__(self):
    if self.bits:
    self.bits.flush()
    self.bits.close()

    if __name__ == "__main__":
    import sys, argparse
    from hashlib import sha1

    def cmd_load(args):
    filt = BloomFilter(args.filter, m=args.bits, k=args.hashes)

    if args.sha1:
    if args.lower:
    filt.update(sha1(line.rstrip("\n").lower().encode("utf-8")).hexdigest()
    for line in open(args.input))
    else:
    filt.update(sha1(line.rstrip("\n").encode("utf-8")).hexdigest()
    for line in open(args.input))
    else:
    if args.lower:
    filt.update(line.rstrip("\n").lower().encode("utf-8")
    for line in open(args.input))
    else:
    filt.update(line.rstrip("\n").encode("utf-8")
    for line in open(args.input))

    filt.sync()

    def cmd_test(args):
    value = args.value.encode("utf-8")
    if args.sha1:
    value = sha1(value).hexdigest()

    filt = BloomFilter(args.filter, readonly=True)
    if filt.contains(value):
    print("Found")
    else:
    print("Not found")

    parser = argparse.ArgumentParser(prog=sys.argv[0], epilog="""
    examples:
    %(prog)s load -m 5033164800 -k 11 -l passwords.bloom pwned-passwords-1.0.txt
    %(prog)s load -l passwords.bloom pwned-passwords-update-1.txt
    %(prog)s test -s passwords.bloom letmein
    %(prog)s test -s passwords.bloom si2v8jX4LG""",
    formatter_class=argparse.RawDescriptionHelpFormatter)
    subparsers = parser.add_subparsers(dest="subcommand")
    p_load = subparsers.add_parser('load', help='create or add to a bloom filter')
    p_load.set_defaults(func=cmd_load)
    p_load.add_argument("-m", "--bits", type=int, default=5033164800, help="number of bits in the filter")
    p_load.add_argument("-k", "--hashes", type=int, default=11, help="number of hash functions to use")
    p_load.add_argument("-l", "--lower", action="store_true", help="lowercase input data")
    p_load.add_argument("-s", "--sha1", action="store_true", help="SHA-1 input data")
    p_load.add_argument("filter", type=str, help='filename of the filter')
    p_load.add_argument("input", type=str, help='file containing input data')

    p_test = subparsers.add_parser('test', help='check whether a given value matches the bloom filter')
    p_test.set_defaults(func=cmd_test)
    p_test.add_argument("-s", "--sha1", action="store_true", help="SHA-1 input data")
    p_test.add_argument("filter", type=str, help='filename of the filter')
    p_test.add_argument("value", type=str, help='value to look up in the filter')

    args = parser.parse_args()
    if args.subcommand is None:
    parser.print_help()
    sys.exit(0)

    args.func(args)