Skip to content

Instantly share code, notes, and snippets.

@xtao
Forked from tarekziade/distribution.py
Created June 4, 2016 05:39
Show Gist options
  • Save xtao/b2e918c7d0dc092bbab24637ef7ae6bc to your computer and use it in GitHub Desktop.
Save xtao/b2e918c7d0dc092bbab24637ef7ae6bc to your computer and use it in GitHub Desktop.

Revisions

  1. @tarekziade tarekziade revised this gist May 23, 2016. 1 changed file with 21 additions and 57 deletions.
    78 changes: 21 additions & 57 deletions distribution.py
    Original file line number Diff line number Diff line change
    @@ -22,8 +22,6 @@
    Consistent Hashing implementation inspired by:
    http://techspot.zzzeek.org/2012/07/07/the-absolutely-simplest-consistent-hashing-example
    XXX Add credits for the murmur hash
    Also, good read on hashes:
    http://programmers.stackexchange.com/questions/49550/which-hashing-algorithm-is-best-for-uniqueness-and-speed/145633#145633
    @@ -33,12 +31,29 @@
    from collections import defaultdict
    import binascii
    import time
    from functools import wraps


    class CollisionError(Exception):
    pass


    _collisions = {}


    def catch_collision(func):
    @wraps(func)
    def _catch(key):
    res = func(key)
    if res in _collisions and key != _collisions[res]:
    raise CollisionError('%s and %s with %s' % (key, _collisions[res],
    func))
    _collisions[res] = key
    return res
    return _catch


    @catch_collision
    def fnv32a(key):
    hval = 0x811c9dc5
    fnv_32_prime = 0x01000193
    @@ -49,73 +64,25 @@ def fnv32a(key):
    return hval


    @catch_collision
    def sha512(key):
    return long(hashlib.sha512(key).hexdigest(), 16)


    @catch_collision
    def sha256(key):
    return long(hashlib.sha256(key).hexdigest(), 16)


    @catch_collision
    def md5(key):
    return long(hashlib.md5(key).hexdigest(), 16)



    MASK = 2 ** 64 - 1
    _64 = 8 * 8

    def murmur64(data, seed=19820125):
    m = 0xc6a4a7935bd1e995
    r = 47
    data = bytearray(data)
    h = seed ^ ((m * len(data)) & MASK)
    off = len(data) / _64

    def b2l(bytes):
    return sum((b << (k * 8) for k, b in enumerate(bytes)))

    for ll in range(0, off, 8):
    k = b2l(data[ll:ll + 8])
    k = (k * m) & MASK
    k = k ^ ((k >> r) & MASK)
    k = (k * m) & MASK
    h = (h ^ k)
    h = (h * m) & MASK

    l = len(data) & 7

    if l >= 7:
    h = (h ^ (data[off+6] << 48))

    if l >= 6:
    h = (h ^ (data[off+5] << 40))

    if l >= 5:
    h = (h ^ (data[off+4] << 32))

    if l >= 4:
    h = (h ^ (data[off+3] << 24))

    if l >= 3:
    h = (h ^ (data[off+2] << 16))

    if l >= 2:
    h = (h ^ (data[off+1] << 8))

    if l >= 1:
    h = (h ^ data[off])
    h = (h * m) & MASK

    h = h ^ ((h >> r) & MASK)
    h = (h * m) & MASK
    h = h ^ ((h >> r) & MASK)
    return h


    class RendezVous(object):

    def __init__(self, ips=None, hash=murmur64):
    def __init__(self, ips=None, hash=md5):
    if ips is None:
    ips = []
    self.ips = ips
    @@ -165,9 +132,6 @@ def add(self, ip):
    for i in range(self.replicas):
    sip = _repl(ip, i)
    hashed = self._hash(sip)
    if hashed in self._ips:
    raise CollisionError('Collision! %s vs %s with %s hash' % (sip,
    self._ips[hashed], self._hash))
    self._ips[hashed] = sip
    bisect.insort(self._hashed_ips, hashed)

  2. @tarekziade tarekziade revised this gist May 18, 2016. 1 changed file with 33 additions and 11 deletions.
    44 changes: 33 additions & 11 deletions distribution.py
    Original file line number Diff line number Diff line change
    @@ -32,6 +32,11 @@
    import bisect
    from collections import defaultdict
    import binascii
    import time


    class CollisionError(Exception):
    pass


    def fnv32a(key):
    @@ -44,6 +49,10 @@ def fnv32a(key):
    return hval


    def sha512(key):
    return long(hashlib.sha512(key).hexdigest(), 16)


    def sha256(key):
    return long(hashlib.sha256(key).hexdigest(), 16)

    @@ -52,21 +61,22 @@ def md5(key):
    return long(hashlib.md5(key).hexdigest(), 16)


    def bytes_to_long(bytes):
    assert len(bytes) == 8
    return sum((b << (k * 8) for k, b in enumerate(bytes)))

    MASK = 2 ** 64 - 1
    _64 = 8 * 8

    def murmur64(data, seed=19820125):
    m = 0xc6a4a7935bd1e995
    r = 47
    MASK = 2 ** 64 - 1
    data = bytearray(data)
    h = seed ^ ((m * len(data)) & MASK)
    off = len(data) / _64

    def b2l(bytes):
    return sum((b << (k * 8) for k, b in enumerate(bytes)))

    off = len(data) / 8 * 8
    for ll in range(0, off, 8):
    k = bytes_to_long(data[ll:ll + 8])
    k = b2l(data[ll:ll + 8])
    k = (k * m) & MASK
    k = k ^ ((k >> r) & MASK)
    k = (k * m) & MASK
    @@ -100,7 +110,6 @@ def murmur64(data, seed=19820125):
    h = h ^ ((h >> r) & MASK)
    h = (h * m) & MASK
    h = h ^ ((h >> r) & MASK)

    return h


    @@ -140,7 +149,7 @@ def _repl(name, index):

    class ConsistentHashing(object):

    def __init__(self, ips=[], replicas=100, hash=md5):
    def __init__(self, ips=[], replicas=200, hash=md5):
    self._ips = {}
    self._hashed_ips = []
    self.replicas = replicas
    @@ -156,6 +165,9 @@ def add(self, ip):
    for i in range(self.replicas):
    sip = _repl(ip, i)
    hashed = self._hash(sip)
    if hashed in self._ips:
    raise CollisionError('Collision! %s vs %s with %s hash' % (sip,
    self._ips[hashed], self._hash))
    self._ips[hashed] = sip
    bisect.insort(self._hashed_ips, hashed)

    @@ -227,9 +239,19 @@ def run_test(servers, users):
    'postgres1']

    for klass in (ConsistentHashing, RendezVous):
    for hash in (md5, murmur64, sha256, binascii.crc32, fnv32a):
    cluster = klass(list(servers), hash=hash)
    for hash in (md5, sha256, binascii.crc32, fnv32a, sha512):
    try:
    cluster = klass(list(servers), hash=hash)
    except CollisionError:
    print('Collision error with hash %s' % hash)
    continue

    print(cluster)
    run_test(cluster, users)
    start = time.time()
    try:
    run_test(cluster, users)
    except CollisionError:
    print('Collision error..')
    print('Took %d seconds' % (time.time() - start))
    print
    print
  3. @tarekziade tarekziade revised this gist May 18, 2016. 1 changed file with 13 additions and 2 deletions.
    15 changes: 13 additions & 2 deletions distribution.py
    Original file line number Diff line number Diff line change
    @@ -31,10 +31,21 @@
    import hashlib
    import bisect
    from collections import defaultdict
    import binascii


    def fnv32a(key):
    hval = 0x811c9dc5
    fnv_32_prime = 0x01000193
    uint32_max = 2 ** 32
    for s in key:
    hval = hval ^ ord(s)
    hval = (hval * fnv_32_prime) % uint32_max
    return hval


    def sha256(key):
    return long(hashlib.md5(key).hexdigest(), 16)
    return long(hashlib.sha256(key).hexdigest(), 16)


    def md5(key):
    @@ -216,7 +227,7 @@ def run_test(servers, users):
    'postgres1']

    for klass in (ConsistentHashing, RendezVous):
    for hash in (md5, murmur64, sha256):
    for hash in (md5, murmur64, sha256, binascii.crc32, fnv32a):
    cluster = klass(list(servers), hash=hash)
    print(cluster)
    run_test(cluster, users)
  4. @tarekziade tarekziade revised this gist May 18, 2016. 1 changed file with 11 additions and 6 deletions.
    17 changes: 11 additions & 6 deletions distribution.py
    Original file line number Diff line number Diff line change
    @@ -2,16 +2,24 @@
    We have a few servers and we want a load-balancer to
    distribute incoming requests across them in a deterministic
    and consistent way.
    and consistent way - without keeping any counter to make the
    decision.
    Removing a backend server should not impact users on other
    servers.
    Adding a backend will generate the redistribution of
    users on the nearest 2 servers.
    users across other servers.
    Consistent Hashing inspired by:
    The goal is to come up with the best algorithm
    for 1M users across 5 servers. Speed is a bonus.
    Two known techniques:
    - RendezVous : https://en.wikipedia.org/wiki/Rendezvous_hashing#Comparison_With_Consistent_Hashing
    - Consistent Hashing: https://en.wikipedia.org/wiki/Consistent_hashing
    Consistent Hashing implementation inspired by:
    http://techspot.zzzeek.org/2012/07/07/the-absolutely-simplest-consistent-hashing-example
    XXX Add credits for the murmur hash
    @@ -20,13 +28,11 @@
    http://programmers.stackexchange.com/questions/49550/which-hashing-algorithm-is-best-for-uniqueness-and-speed/145633#145633
    """
    import sys
    import hashlib
    import bisect
    from collections import defaultdict



    def sha256(key):
    return long(hashlib.md5(key).hexdigest(), 16)

    @@ -117,7 +123,6 @@ def select(self, key):
    return winner



    def _repl(name, index):
    return '%s:%d' % (name, index)

  5. @tarekziade tarekziade revised this gist May 18, 2016. 1 changed file with 33 additions and 4 deletions.
    37 changes: 33 additions & 4 deletions distribution.py
    Original file line number Diff line number Diff line change
    @@ -15,13 +15,22 @@
    http://techspot.zzzeek.org/2012/07/07/the-absolutely-simplest-consistent-hashing-example
    XXX Add credits for the murmur hash
    Also, good read on hashes:
    http://programmers.stackexchange.com/questions/49550/which-hashing-algorithm-is-best-for-uniqueness-and-speed/145633#145633
    """
    import sys
    import hashlib
    import bisect
    from collections import defaultdict



    def sha256(key):
    return long(hashlib.md5(key).hexdigest(), 16)


    def md5(key):
    return long(hashlib.md5(key).hexdigest(), 16)

    @@ -160,8 +169,18 @@ def run_test(servers, users):

    print '===='
    print('Distribution')
    smallest = NUM_USERS + 1
    biggest = 0
    for db in selection:
    print('%d users in %s' % (len(selection[db]), db))
    size = len(selection[db])
    if size < smallest:
    smallest = size
    if size > biggest:
    biggest = size

    print('%d users in %s' % (size, db))

    print('span: %d' % (biggest - smallest))

    # removing server 2 and 4
    servers.remove('postgres2')
    @@ -172,8 +191,18 @@ def run_test(servers, users):
    user_db = servers.select(user)
    selection[user_db].append(user)

    for db in selection:
    print('%d users in %s' % (len(selection[db]), db))
    smallest = NUM_USERS + 1
    biggest = 0
    for i, db in enumerate(selection):
    size = len(selection[db])
    if size < smallest:
    smallest = size
    if size > biggest:
    biggest = size

    print('%d users in %s' % (size, db))

    print('span: %d' % (biggest - smallest))


    if __name__ == '__main__':
    @@ -182,7 +211,7 @@ def run_test(servers, users):
    'postgres1']

    for klass in (ConsistentHashing, RendezVous):
    for hash in (md5, murmur64):
    for hash in (md5, murmur64, sha256):
    cluster = klass(list(servers), hash=hash)
    print(cluster)
    run_test(cluster, users)
  6. @tarekziade tarekziade revised this gist May 17, 2016. 1 changed file with 114 additions and 13 deletions.
    127 changes: 114 additions & 13 deletions distribution.py
    Original file line number Diff line number Diff line change
    @@ -10,63 +10,150 @@
    Adding a backend will generate the redistribution of
    users on the nearest 2 servers.
    inspired by:
    Consistent Hashing inspired by:
    http://techspot.zzzeek.org/2012/07/07/the-absolutely-simplest-consistent-hashing-example
    XXX Add credits for the murmur hash
    """
    import sys
    import hashlib
    import bisect
    from collections import defaultdict


    def _hash(key):
    def md5(key):
    return long(hashlib.md5(key).hexdigest(), 16)


    def bytes_to_long(bytes):
    assert len(bytes) == 8
    return sum((b << (k * 8) for k, b in enumerate(bytes)))


    def murmur64(data, seed=19820125):
    m = 0xc6a4a7935bd1e995
    r = 47
    MASK = 2 ** 64 - 1
    data = bytearray(data)
    h = seed ^ ((m * len(data)) & MASK)

    off = len(data) / 8 * 8
    for ll in range(0, off, 8):
    k = bytes_to_long(data[ll:ll + 8])
    k = (k * m) & MASK
    k = k ^ ((k >> r) & MASK)
    k = (k * m) & MASK
    h = (h ^ k)
    h = (h * m) & MASK

    l = len(data) & 7

    if l >= 7:
    h = (h ^ (data[off+6] << 48))

    if l >= 6:
    h = (h ^ (data[off+5] << 40))

    if l >= 5:
    h = (h ^ (data[off+4] << 32))

    if l >= 4:
    h = (h ^ (data[off+3] << 24))

    if l >= 3:
    h = (h ^ (data[off+2] << 16))

    if l >= 2:
    h = (h ^ (data[off+1] << 8))

    if l >= 1:
    h = (h ^ data[off])
    h = (h * m) & MASK

    h = h ^ ((h >> r) & MASK)
    h = (h * m) & MASK
    h = h ^ ((h >> r) & MASK)

    return h


    class RendezVous(object):

    def __init__(self, ips=None, hash=murmur64):
    if ips is None:
    ips = []
    self.ips = ips
    self._hash = hash

    def __str__(self):
    return '<RendezVous with %s hash>' % self._hash

    def add(self, ip):
    self.ips.append(ip)

    def remove(self, ip):
    self.ips.remove(ip)

    def select(self, key):
    high_score = -1
    winner = None
    for ip in self.ips:
    score = self._hash("%s-%s" % (str(ip), str(key)))
    if score > high_score:
    high_score, winner = score, ip

    elif score == high_score:
    high_score, winner = score, max(str(ip), str(winner))
    return winner



    def _repl(name, index):
    return '%s:%d' % (name, index)


    class Servers(object):
    class ConsistentHashing(object):

    def __init__(self, ips=[], replicas=100):
    def __init__(self, ips=[], replicas=100, hash=md5):
    self._ips = {}
    self._hashed_ips = []
    self.replicas = replicas
    self._hash = hash

    for ip in ips:
    self.add(ip)

    def __str__(self):
    return '<ConsistentHashing with %s hash>' % self._hash

    def add(self, ip):
    for i in range(self.replicas):
    sip = _repl(ip, i)
    hashed = _hash(sip)
    hashed = self._hash(sip)
    self._ips[hashed] = sip
    bisect.insort(self._hashed_ips, hashed)

    def remove(self, ip):
    for i in range(self.replicas):
    sip = _repl(ip, i)
    hashed = _hash(sip)
    hashed = self._hash(sip)
    del self._ips[hashed]
    index = bisect.bisect_left(self._hashed_ips, hashed)
    del self._hashed_ips[index]

    def select(self, username):
    hashed = _hash(username)
    hashed = self._hash(username)
    start = bisect.bisect(self._hashed_ips, hashed,
    hi=len(self._hashed_ips)-1)
    return self._ips[self._hashed_ips[start]].split(':')[0]


    NUM_USERS = 1000000

    if __name__ == '__main__':
    selection = defaultdict(list)
    servers = Servers(['postgres5', 'postgres2', 'postgres3', 'postgres4',
    'postgres1'])

    users = ['%06d' % i for i in range(NUM_USERS)]
    users = users
    def run_test(servers, users):
    selection = defaultdict(list)
    for user in users:
    user_db = servers.select(user)
    selection[user_db].append(user)
    @@ -87,3 +174,17 @@ def select(self, username):

    for db in selection:
    print('%d users in %s' % (len(selection[db]), db))


    if __name__ == '__main__':
    users = ['%06d' % i for i in range(NUM_USERS)]
    servers = ['postgres5', 'postgres2', 'postgres3', 'postgres4',
    'postgres1']

    for klass in (ConsistentHashing, RendezVous):
    for hash in (md5, murmur64):
    cluster = klass(list(servers), hash=hash)
    print(cluster)
    run_test(cluster, users)
    print
    print
  7. @tarekziade tarekziade revised this gist May 15, 2016. 1 changed file with 12 additions and 19 deletions.
    31 changes: 12 additions & 19 deletions distribution.py
    Original file line number Diff line number Diff line change
    @@ -11,18 +11,20 @@
    users on the nearest 2 servers.
    inspired by:
    http://techspot.zzzeek.org/2012/07/07/the-absolutely-simplest-consistent-hashing-example/
    http://techspot.zzzeek.org/2012/07/07/the-absolutely-simplest-consistent-hashing-example
    """
    import hashlib
    import bisect
    import random
    import pprint

    from collections import defaultdict


    _hash = hashlib.md5
    def _hash(key):
    return long(hashlib.md5(key).hexdigest(), 16)


    def _repl(name, index):
    return '%s:%d' % (name, index)


    class Servers(object):
    @@ -36,28 +38,21 @@ def __init__(self, ips=[], replicas=100):

    def add(self, ip):
    for i in range(self.replicas):
    sip = ip + ':' + str(i)
    hashed = self._hash(sip)
    sip = _repl(ip, i)
    hashed = _hash(sip)
    self._ips[hashed] = sip
    bisect.insort(self._hashed_ips, hashed)

    def remove(self, ip):
    for i in range(self.replicas):
    sip = ip + ':' + str(i)
    hashed = self._hash(sip)
    sip = _repl(ip, i)
    hashed = _hash(sip)
    del self._ips[hashed]
    index = bisect.bisect_left(self._hashed_ips, hashed)
    del self._hashed_ips[index]

    def _hash(self, key):
    def _hexhash(key):
    return _hash(key).hexdigest()

    hash = _hexhash(key)
    return long(hash, 16)

    def select(self, username):
    hashed = self._hash(username)
    hashed = _hash(username)
    start = bisect.bisect(self._hashed_ips, hashed,
    hi=len(self._hashed_ips)-1)
    return self._ips[self._hashed_ips[start]].split(':')[0]
    @@ -70,7 +65,6 @@ def select(self, username):
    servers = Servers(['postgres5', 'postgres2', 'postgres3', 'postgres4',
    'postgres1'])


    users = ['%06d' % i for i in range(NUM_USERS)]
    users = users
    for user in users:
    @@ -93,4 +87,3 @@ def select(self, username):

    for db in selection:
    print('%d users in %s' % (len(selection[db]), db))

  8. @tarekziade tarekziade revised this gist May 14, 2016. 1 changed file with 16 additions and 2 deletions.
    18 changes: 16 additions & 2 deletions distribution.py
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,19 @@
    # inspired by
    # http://techspot.zzzeek.org/2012/07/07/the-absolutely-simplest-consistent-hashing-example/
    """ Consistent load-balancing.
    We have a few servers and we want a load-balancer to
    distribute incoming requests across them in a deterministic
    and consistent way.
    Removing a backend server should not impact users on other
    servers.
    Adding a backend will generate the redistribution of
    users on the nearest 2 servers.
    inspired by:
    http://techspot.zzzeek.org/2012/07/07/the-absolutely-simplest-consistent-hashing-example/
    """
    import hashlib
    import bisect
    import random
  9. @tarekziade tarekziade revised this gist May 14, 2016. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions distribution.py
    Original file line number Diff line number Diff line change
    @@ -49,15 +49,15 @@ def select(self, username):
    return self._ips[self._hashed_ips[start]].split(':')[0]


    NUM_USERS = 1000000

    if __name__ == '__main__':
    selection = defaultdict(list)
    servers = Servers(['postgres5', 'postgres2', 'postgres3', 'postgres4',
    'postgres1'])

    #print('Hash ring')
    #pprint.pprint([(servers._ips[hash], hash) for hash in servers._hashed_ips])

    users = ['%06d' % i for i in range(100000)]
    users = ['%06d' % i for i in range(NUM_USERS)]
    users = users
    for user in users:
    user_db = servers.select(user)
  10. @tarekziade tarekziade revised this gist May 14, 2016. 1 changed file with 16 additions and 11 deletions.
    27 changes: 16 additions & 11 deletions distribution.py
    Original file line number Diff line number Diff line change
    @@ -13,22 +13,27 @@

    class Servers(object):

    def __init__(self, ips=[]):
    def __init__(self, ips=[], replicas=100):
    self._ips = {}
    self._hashed_ips = []
    self.replicas = replicas
    for ip in ips:
    self.add(ip)

    def add(self, ip):
    hashed = self._hash(ip)
    self._ips[hashed] = ip
    bisect.insort(self._hashed_ips, hashed)
    for i in range(self.replicas):
    sip = ip + ':' + str(i)
    hashed = self._hash(sip)
    self._ips[hashed] = sip
    bisect.insort(self._hashed_ips, hashed)

    def remove(self, ip):
    hashed = self._hash(ip)
    del self._ips[hashed]
    index = bisect.bisect_left(self._hashed_ips, hashed)
    del self._hashed_ips[index]
    for i in range(self.replicas):
    sip = ip + ':' + str(i)
    hashed = self._hash(sip)
    del self._ips[hashed]
    index = bisect.bisect_left(self._hashed_ips, hashed)
    del self._hashed_ips[index]

    def _hash(self, key):
    def _hexhash(key):
    @@ -41,16 +46,16 @@ def select(self, username):
    hashed = self._hash(username)
    start = bisect.bisect(self._hashed_ips, hashed,
    hi=len(self._hashed_ips)-1)
    return self._ips[self._hashed_ips[start]]
    return self._ips[self._hashed_ips[start]].split(':')[0]


    if __name__ == '__main__':
    selection = defaultdict(list)
    servers = Servers(['postgres5', 'postgres2', 'postgres3', 'postgres4',
    'postgres1'])

    print('Hash ring')
    pprint.pprint([(servers._ips[hash], hash) for hash in servers._hashed_ips])
    #print('Hash ring')
    #pprint.pprint([(servers._ips[hash], hash) for hash in servers._hashed_ips])

    users = ['%06d' % i for i in range(100000)]
    users = users
  11. @tarekziade tarekziade revised this gist May 13, 2016. 1 changed file with 19 additions and 9 deletions.
    28 changes: 19 additions & 9 deletions distribution.py
    Original file line number Diff line number Diff line change
    @@ -3,10 +3,14 @@
    import hashlib
    import bisect
    import random
    import pprint

    from collections import defaultdict


    _hash = hashlib.md5


    class Servers(object):

    def __init__(self, ips=[]):
    @@ -28,34 +32,39 @@ def remove(self, ip):

    def _hash(self, key):
    def _hexhash(key):
    return hashlib.md5(key).hexdigest()
    return _hash(key).hexdigest()

    hash = _hexhash(key)
    return long(hash, 16)

    def select(self, username):
    hashed = self._hash(username)
    start = bisect.bisect(self._hashed_ips, hashed)
    if start == len(self._hashed_ips):
    start = 0
    start = bisect.bisect(self._hashed_ips, hashed,
    hi=len(self._hashed_ips)-1)
    return self._ips[self._hashed_ips[start]]


    if __name__ == '__main__':
    selection = defaultdict(list)
    servers = Servers(['postgres1', 'postgres2', 'postgres3', 'postgres4',
    'postgres5'])
    users = ['%06d' % i for i in range(100000)]
    servers = Servers(['postgres5', 'postgres2', 'postgres3', 'postgres4',
    'postgres1'])

    print('Hash ring')
    pprint.pprint([(servers._ips[hash], hash) for hash in servers._hashed_ips])

    users = ['%06d' % i for i in range(100000)]
    users = users
    for user in users:
    user_db = servers.select(user)
    selection[user_db].append(user)

    print '===='
    print('Distribution')
    for db in selection:
    print('%d users in %s' % (len(selection[db]), db))

    # removing server 2 and 4
    servers.remove('postgres2')
    servers.remove('postgres4')
    print '===='

    selection = defaultdict(list)
    @@ -64,4 +73,5 @@ def select(self, username):
    selection[user_db].append(user)

    for db in selection:
    print('%d users in %s' % (len(selection[db]), db))
    print('%d users in %s' % (len(selection[db]), db))

  12. @tarekziade tarekziade revised this gist May 13, 2016. No changes.
  13. @tarekziade tarekziade created this gist May 13, 2016.
    67 changes: 67 additions & 0 deletions distribution.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,67 @@
    # inspired by
    # http://techspot.zzzeek.org/2012/07/07/the-absolutely-simplest-consistent-hashing-example/
    import hashlib
    import bisect
    import random

    from collections import defaultdict


    class Servers(object):

    def __init__(self, ips=[]):
    self._ips = {}
    self._hashed_ips = []
    for ip in ips:
    self.add(ip)

    def add(self, ip):
    hashed = self._hash(ip)
    self._ips[hashed] = ip
    bisect.insort(self._hashed_ips, hashed)

    def remove(self, ip):
    hashed = self._hash(ip)
    del self._ips[hashed]
    index = bisect.bisect_left(self._hashed_ips, hashed)
    del self._hashed_ips[index]

    def _hash(self, key):
    def _hexhash(key):
    return hashlib.md5(key).hexdigest()
    hash = _hexhash(key)
    return long(hash, 16)

    def select(self, username):
    hashed = self._hash(username)
    start = bisect.bisect(self._hashed_ips, hashed)
    if start == len(self._hashed_ips):
    start = 0
    return self._ips[self._hashed_ips[start]]


    if __name__ == '__main__':
    selection = defaultdict(list)
    servers = Servers(['postgres1', 'postgres2', 'postgres3', 'postgres4',
    'postgres5'])
    users = ['%06d' % i for i in range(100000)]

    for user in users:
    user_db = servers.select(user)
    selection[user_db].append(user)

    for db in selection:
    print('%d users in %s' % (len(selection[db]), db))

    # removing server 2 and 4
    servers.remove('postgres2')
    servers.remove('postgres4')
    print '===='

    selection = defaultdict(list)
    for user in users:
    user_db = servers.select(user)
    selection[user_db].append(user)

    for db in selection:
    print('%d users in %s' % (len(selection[db]), db))