#!/usr/bin/python from __future__ import division from bitarray import bitarray from hashlib import sha256 from math import log import struct def gcs_hash(w, (N,P)): h = sha256(w).hexdigest() h = long(h,16) return h % (N*P) def gcs_encode(V, P, out): Q = V // P R = V % P logp = int(log(P,2)) # write Q as unary out.extend([True for i in range(Q)] + [False]) # write log2(P) most significant bits of R out.extend([R & (1 << i) and True or False for i in range(logp-1,-1,-1)]) def main(urls, P, filename): N = len(urls) hash_values = [ gcs_hash(i, (N, P)) for i in urls ] hash_values.append(0) hash_values.sort() out = bitarray(endian="big") # network for i in range(len(hash_values) - 1): delta = hash_values[i+1] - hash_values[i] if delta == 0: continue gcs_encode(delta, P, out) result = out.tobytes() f = open(filename, 'wb') # write N and P as unsigned long integers f.write(struct.pack("!LL", N, P)) f.write(result) print "* size:", f.tell() f.close() if __name__ == "__main__": urls = [str(n) for n in xrange(1000)] P = 2**8 main(urls, P, 'table.gcs')