-
-
Save xtao/b2e918c7d0dc092bbab24637ef7ae6bc to your computer and use it in GitHub Desktop.
Revisions
-
tarekziade revised this gist
May 23, 2016 . 1 changed file with 21 additions and 57 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -22,8 +22,6 @@ Consistent Hashing implementation inspired by: http://techspot.zzzeek.org/2012/07/07/the-absolutely-simplest-consistent-hashing-example Also, good read on hashes: http://programmers.stackexchange.com/questions/49550/which-hashing-algorithm-is-best-for-uniqueness-and-speed/145633#145633 @@ -33,12 +31,29 @@ from collections import defaultdict import binascii import time from functools import wraps class CollisionError(Exception): pass _collisions = {} def catch_collision(func): @wraps(func) def _catch(key): res = func(key) if res in _collisions and key != _collisions[res]: raise CollisionError('%s and %s with %s' % (key, _collisions[res], func)) _collisions[res] = key return res return _catch @catch_collision def fnv32a(key): hval = 0x811c9dc5 fnv_32_prime = 0x01000193 @@ -49,73 +64,25 @@ def fnv32a(key): return hval @catch_collision def sha512(key): return long(hashlib.sha512(key).hexdigest(), 16) @catch_collision def sha256(key): return long(hashlib.sha256(key).hexdigest(), 16) @catch_collision def md5(key): return long(hashlib.md5(key).hexdigest(), 16) class RendezVous(object): def __init__(self, ips=None, hash=md5): if ips is None: ips = [] self.ips = ips @@ -165,9 +132,6 @@ def add(self, ip): for i in range(self.replicas): sip = _repl(ip, i) hashed = self._hash(sip) self._ips[hashed] = sip bisect.insort(self._hashed_ips, hashed) -
tarekziade revised this gist
May 18, 2016 . 1 changed file with 33 additions and 11 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -32,6 +32,11 @@ import bisect from collections import defaultdict import binascii import time class CollisionError(Exception): pass def fnv32a(key): @@ -44,6 +49,10 @@ def fnv32a(key): return hval def sha512(key): return long(hashlib.sha512(key).hexdigest(), 16) def sha256(key): return long(hashlib.sha256(key).hexdigest(), 16) @@ -52,21 +61,22 @@ def md5(key): return long(hashlib.md5(key).hexdigest(), 16) MASK = 2 ** 64 - 1 _64 = 8 * 8 def murmur64(data, seed=19820125): m = 0xc6a4a7935bd1e995 r = 47 data = bytearray(data) h = seed ^ ((m * len(data)) & MASK) off = len(data) / _64 def b2l(bytes): return sum((b << (k * 8) for k, b in enumerate(bytes))) for ll in range(0, off, 8): k = b2l(data[ll:ll + 8]) k = (k * m) & MASK k = k ^ ((k >> r) & MASK) k = (k * m) & MASK @@ -100,7 +110,6 @@ def murmur64(data, seed=19820125): h = h ^ ((h >> r) & MASK) h = (h * m) & MASK h = h ^ ((h >> r) & MASK) return h @@ -140,7 +149,7 @@ def _repl(name, index): class ConsistentHashing(object): def __init__(self, ips=[], replicas=200, hash=md5): self._ips = {} self._hashed_ips = [] self.replicas = replicas @@ -156,6 +165,9 @@ def add(self, ip): for i in range(self.replicas): sip = _repl(ip, i) hashed = self._hash(sip) if hashed in self._ips: raise CollisionError('Collision! %s vs %s with %s hash' % (sip, self._ips[hashed], self._hash)) self._ips[hashed] = sip bisect.insort(self._hashed_ips, hashed) @@ -227,9 +239,19 @@ def run_test(servers, users): 'postgres1'] for klass in (ConsistentHashing, RendezVous): for hash in (md5, sha256, binascii.crc32, fnv32a, sha512): try: cluster = klass(list(servers), hash=hash) except CollisionError: print('Collision error with hash %s' % hash) continue print(cluster) start = time.time() try: run_test(cluster, users) except CollisionError: print('Collision error..') print('Took %d seconds' % (time.time() - start)) print print -
tarekziade revised this gist
May 18, 2016 . 1 changed file with 13 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -31,10 +31,21 @@ import hashlib import bisect from collections import defaultdict import binascii def fnv32a(key): hval = 0x811c9dc5 fnv_32_prime = 0x01000193 uint32_max = 2 ** 32 for s in key: hval = hval ^ ord(s) hval = (hval * fnv_32_prime) % uint32_max return hval def sha256(key): return long(hashlib.sha256(key).hexdigest(), 16) def md5(key): @@ -216,7 +227,7 @@ def run_test(servers, users): 'postgres1'] for klass in (ConsistentHashing, RendezVous): for hash in (md5, murmur64, sha256, binascii.crc32, fnv32a): cluster = klass(list(servers), hash=hash) print(cluster) run_test(cluster, users) -
tarekziade revised this gist
May 18, 2016 . 1 changed file with 11 additions and 6 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -2,16 +2,24 @@ We have a few servers and we want a load-balancer to distribute incoming requests across them in a deterministic and consistent way - without keeping any counter to make the decision. Removing a backend server should not impact users on other servers. Adding a backend will generate the redistribution of users across other servers. The goal is to come up with the best algorithm for 1M users across 5 servers. Speed is a bonus. Two known techniques: - RendezVous : https://en.wikipedia.org/wiki/Rendezvous_hashing#Comparison_With_Consistent_Hashing - Consistent Hashing: https://en.wikipedia.org/wiki/Consistent_hashing Consistent Hashing implementation inspired by: http://techspot.zzzeek.org/2012/07/07/the-absolutely-simplest-consistent-hashing-example XXX Add credits for the murmur hash @@ -20,13 +28,11 @@ http://programmers.stackexchange.com/questions/49550/which-hashing-algorithm-is-best-for-uniqueness-and-speed/145633#145633 """ import hashlib import bisect from collections import defaultdict def sha256(key): return long(hashlib.md5(key).hexdigest(), 16) @@ -117,7 +123,6 @@ def select(self, key): return winner def _repl(name, index): return '%s:%d' % (name, index) -
tarekziade revised this gist
May 18, 2016 . 1 changed file with 33 additions and 4 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -15,13 +15,22 @@ http://techspot.zzzeek.org/2012/07/07/the-absolutely-simplest-consistent-hashing-example XXX Add credits for the murmur hash Also, good read on hashes: http://programmers.stackexchange.com/questions/49550/which-hashing-algorithm-is-best-for-uniqueness-and-speed/145633#145633 """ import sys import hashlib import bisect from collections import defaultdict def sha256(key): return long(hashlib.md5(key).hexdigest(), 16) def md5(key): return long(hashlib.md5(key).hexdigest(), 16) @@ -160,8 +169,18 @@ def run_test(servers, users): print '====' print('Distribution') smallest = NUM_USERS + 1 biggest = 0 for db in selection: size = len(selection[db]) if size < smallest: smallest = size if size > biggest: biggest = size print('%d users in %s' % (size, db)) print('span: %d' % (biggest - smallest)) # removing server 2 and 4 servers.remove('postgres2') @@ -172,8 +191,18 @@ def run_test(servers, users): user_db = servers.select(user) selection[user_db].append(user) smallest = NUM_USERS + 1 biggest = 0 for i, db in enumerate(selection): size = len(selection[db]) if size < smallest: smallest = size if size > biggest: biggest = size print('%d users in %s' % (size, db)) print('span: %d' % (biggest - smallest)) if __name__ == '__main__': @@ -182,7 +211,7 @@ def run_test(servers, users): 'postgres1'] for klass in (ConsistentHashing, RendezVous): for hash in (md5, murmur64, sha256): cluster = klass(list(servers), hash=hash) print(cluster) run_test(cluster, users) -
tarekziade revised this gist
May 17, 2016 . 1 changed file with 114 additions and 13 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -10,63 +10,150 @@ Adding a backend will generate the redistribution of users on the nearest 2 servers. Consistent Hashing inspired by: http://techspot.zzzeek.org/2012/07/07/the-absolutely-simplest-consistent-hashing-example XXX Add credits for the murmur hash """ import sys import hashlib import bisect from collections import defaultdict def md5(key): return long(hashlib.md5(key).hexdigest(), 16) def bytes_to_long(bytes): assert len(bytes) == 8 return sum((b << (k * 8) for k, b in enumerate(bytes))) def murmur64(data, seed=19820125): m = 0xc6a4a7935bd1e995 r = 47 MASK = 2 ** 64 - 1 data = bytearray(data) h = seed ^ ((m * len(data)) & MASK) off = len(data) / 8 * 8 for ll in range(0, off, 8): k = bytes_to_long(data[ll:ll + 8]) k = (k * m) & MASK k = k ^ ((k >> r) & MASK) k = (k * m) & MASK h = (h ^ k) h = (h * m) & MASK l = len(data) & 7 if l >= 7: h = (h ^ (data[off+6] << 48)) if l >= 6: h = (h ^ (data[off+5] << 40)) if l >= 5: h = (h ^ (data[off+4] << 32)) if l >= 4: h = (h ^ (data[off+3] << 24)) if l >= 3: h = (h ^ (data[off+2] << 16)) if l >= 2: h = (h ^ (data[off+1] << 8)) if l >= 1: h = (h ^ data[off]) h = (h * m) & MASK h = h ^ ((h >> r) & MASK) h = (h * m) & MASK h = h ^ ((h >> r) & MASK) return h class RendezVous(object): def __init__(self, ips=None, hash=murmur64): if ips is None: ips = [] self.ips = ips self._hash = hash def __str__(self): return '<RendezVous with %s hash>' % self._hash def add(self, ip): self.ips.append(ip) def remove(self, ip): self.ips.remove(ip) def select(self, key): high_score = -1 winner = None for ip in self.ips: score = self._hash("%s-%s" % (str(ip), str(key))) if score > high_score: high_score, winner = score, ip elif score == high_score: high_score, winner = score, max(str(ip), str(winner)) return winner def _repl(name, index): return '%s:%d' % (name, index) class ConsistentHashing(object): def __init__(self, ips=[], replicas=100, hash=md5): self._ips = {} self._hashed_ips = [] self.replicas = replicas self._hash = hash for ip in ips: self.add(ip) def __str__(self): return '<ConsistentHashing with %s hash>' % self._hash def add(self, ip): for i in range(self.replicas): sip = _repl(ip, i) hashed = self._hash(sip) self._ips[hashed] = sip bisect.insort(self._hashed_ips, hashed) def remove(self, ip): for i in range(self.replicas): sip = _repl(ip, i) hashed = self._hash(sip) del self._ips[hashed] index = bisect.bisect_left(self._hashed_ips, hashed) del self._hashed_ips[index] def select(self, username): hashed = self._hash(username) start = bisect.bisect(self._hashed_ips, hashed, hi=len(self._hashed_ips)-1) return self._ips[self._hashed_ips[start]].split(':')[0] NUM_USERS = 1000000 def run_test(servers, users): selection = defaultdict(list) for user in users: user_db = servers.select(user) selection[user_db].append(user) @@ -87,3 +174,17 @@ def select(self, username): for db in selection: print('%d users in %s' % (len(selection[db]), db)) if __name__ == '__main__': users = ['%06d' % i for i in range(NUM_USERS)] servers = ['postgres5', 'postgres2', 'postgres3', 'postgres4', 'postgres1'] for klass in (ConsistentHashing, RendezVous): for hash in (md5, murmur64): cluster = klass(list(servers), hash=hash) print(cluster) run_test(cluster, users) print print -
tarekziade revised this gist
May 15, 2016 . 1 changed file with 12 additions and 19 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -11,18 +11,20 @@ users on the nearest 2 servers. inspired by: http://techspot.zzzeek.org/2012/07/07/the-absolutely-simplest-consistent-hashing-example """ import hashlib import bisect from collections import defaultdict def _hash(key): return long(hashlib.md5(key).hexdigest(), 16) def _repl(name, index): return '%s:%d' % (name, index) class Servers(object): @@ -36,28 +38,21 @@ def __init__(self, ips=[], replicas=100): def add(self, ip): for i in range(self.replicas): sip = _repl(ip, i) hashed = _hash(sip) self._ips[hashed] = sip bisect.insort(self._hashed_ips, hashed) def remove(self, ip): for i in range(self.replicas): sip = _repl(ip, i) hashed = _hash(sip) del self._ips[hashed] index = bisect.bisect_left(self._hashed_ips, hashed) del self._hashed_ips[index] def select(self, username): hashed = _hash(username) start = bisect.bisect(self._hashed_ips, hashed, hi=len(self._hashed_ips)-1) return self._ips[self._hashed_ips[start]].split(':')[0] @@ -70,7 +65,6 @@ def select(self, username): servers = Servers(['postgres5', 'postgres2', 'postgres3', 'postgres4', 'postgres1']) users = ['%06d' % i for i in range(NUM_USERS)] users = users for user in users: @@ -93,4 +87,3 @@ def select(self, username): for db in selection: print('%d users in %s' % (len(selection[db]), db)) -
tarekziade revised this gist
May 14, 2016 . 1 changed file with 16 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,5 +1,19 @@ """ Consistent load-balancing. We have a few servers and we want a load-balancer to distribute incoming requests across them in a deterministic and consistent way. Removing a backend server should not impact users on other servers. Adding a backend will generate the redistribution of users on the nearest 2 servers. inspired by: http://techspot.zzzeek.org/2012/07/07/the-absolutely-simplest-consistent-hashing-example/ """ import hashlib import bisect import random -
tarekziade revised this gist
May 14, 2016 . 1 changed file with 3 additions and 3 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -49,15 +49,15 @@ def select(self, username): return self._ips[self._hashed_ips[start]].split(':')[0] NUM_USERS = 1000000 if __name__ == '__main__': selection = defaultdict(list) servers = Servers(['postgres5', 'postgres2', 'postgres3', 'postgres4', 'postgres1']) users = ['%06d' % i for i in range(NUM_USERS)] users = users for user in users: user_db = servers.select(user) -
tarekziade revised this gist
May 14, 2016 . 1 changed file with 16 additions and 11 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -13,22 +13,27 @@ class Servers(object): def __init__(self, ips=[], replicas=100): self._ips = {} self._hashed_ips = [] self.replicas = replicas for ip in ips: self.add(ip) def add(self, ip): for i in range(self.replicas): sip = ip + ':' + str(i) hashed = self._hash(sip) self._ips[hashed] = sip bisect.insort(self._hashed_ips, hashed) def remove(self, ip): for i in range(self.replicas): sip = ip + ':' + str(i) hashed = self._hash(sip) del self._ips[hashed] index = bisect.bisect_left(self._hashed_ips, hashed) del self._hashed_ips[index] def _hash(self, key): def _hexhash(key): @@ -41,16 +46,16 @@ def select(self, username): hashed = self._hash(username) start = bisect.bisect(self._hashed_ips, hashed, hi=len(self._hashed_ips)-1) return self._ips[self._hashed_ips[start]].split(':')[0] if __name__ == '__main__': selection = defaultdict(list) servers = Servers(['postgres5', 'postgres2', 'postgres3', 'postgres4', 'postgres1']) #print('Hash ring') #pprint.pprint([(servers._ips[hash], hash) for hash in servers._hashed_ips]) users = ['%06d' % i for i in range(100000)] users = users -
tarekziade revised this gist
May 13, 2016 . 1 changed file with 19 additions and 9 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -3,10 +3,14 @@ import hashlib import bisect import random import pprint from collections import defaultdict _hash = hashlib.md5 class Servers(object): def __init__(self, ips=[]): @@ -28,34 +32,39 @@ def remove(self, ip): def _hash(self, key): def _hexhash(key): return _hash(key).hexdigest() hash = _hexhash(key) return long(hash, 16) def select(self, username): hashed = self._hash(username) start = bisect.bisect(self._hashed_ips, hashed, hi=len(self._hashed_ips)-1) return self._ips[self._hashed_ips[start]] if __name__ == '__main__': selection = defaultdict(list) servers = Servers(['postgres5', 'postgres2', 'postgres3', 'postgres4', 'postgres1']) print('Hash ring') pprint.pprint([(servers._ips[hash], hash) for hash in servers._hashed_ips]) users = ['%06d' % i for i in range(100000)] users = users for user in users: user_db = servers.select(user) selection[user_db].append(user) print '====' print('Distribution') for db in selection: print('%d users in %s' % (len(selection[db]), db)) # removing server 2 and 4 servers.remove('postgres2') print '====' selection = defaultdict(list) @@ -64,4 +73,5 @@ def select(self, username): selection[user_db].append(user) for db in selection: print('%d users in %s' % (len(selection[db]), db)) -
tarekziade revised this gist
May 13, 2016 . No changes.There are no files selected for viewing
-
tarekziade created this gist
May 13, 2016 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,67 @@ # inspired by # http://techspot.zzzeek.org/2012/07/07/the-absolutely-simplest-consistent-hashing-example/ import hashlib import bisect import random from collections import defaultdict class Servers(object): def __init__(self, ips=[]): self._ips = {} self._hashed_ips = [] for ip in ips: self.add(ip) def add(self, ip): hashed = self._hash(ip) self._ips[hashed] = ip bisect.insort(self._hashed_ips, hashed) def remove(self, ip): hashed = self._hash(ip) del self._ips[hashed] index = bisect.bisect_left(self._hashed_ips, hashed) del self._hashed_ips[index] def _hash(self, key): def _hexhash(key): return hashlib.md5(key).hexdigest() hash = _hexhash(key) return long(hash, 16) def select(self, username): hashed = self._hash(username) start = bisect.bisect(self._hashed_ips, hashed) if start == len(self._hashed_ips): start = 0 return self._ips[self._hashed_ips[start]] if __name__ == '__main__': selection = defaultdict(list) servers = Servers(['postgres1', 'postgres2', 'postgres3', 'postgres4', 'postgres5']) users = ['%06d' % i for i in range(100000)] for user in users: user_db = servers.select(user) selection[user_db].append(user) for db in selection: print('%d users in %s' % (len(selection[db]), db)) # removing server 2 and 4 servers.remove('postgres2') servers.remove('postgres4') print '====' selection = defaultdict(list) for user in users: user_db = servers.select(user) selection[user_db].append(user) for db in selection: print('%d users in %s' % (len(selection[db]), db))