Skip to content

Instantly share code, notes, and snippets.

@softwaredoug
Created February 3, 2015 04:01
Show Gist options
  • Save softwaredoug/aea69681f538d9780c82 to your computer and use it in GitHub Desktop.
Save softwaredoug/aea69681f538d9780c82 to your computer and use it in GitHub Desktop.

Revisions

  1. softwaredoug created this gist Feb 3, 2015.
    87 changes: 87 additions & 0 deletions hyperLogLog.c
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,87 @@
    #include "stdio.h"
    #include "stdlib.h"

    unsigned int max(unsigned int a, unsigned int b) {
    return a > b ? a : b;
    }

    unsigned int bucket(unsigned int val) {
    return val & 0xF0000000 >> 28;
    }

    unsigned int trailingZeros(unsigned int val) {
    unsigned int mask = 0x00000001;
    unsigned int cnt = 0;
    while (val) {
    val = val & mask;
    mask <<= 1;
    cnt++;
    }
    return cnt;
    }

    void initBuckets(unsigned int* buckets) {
    int i;
    for (i = 0; i < 16; i++) {
    buckets[i] = 0;
    }
    }

    unsigned int cardinality(unsigned int* buckets) {
    int i;
    unsigned int cnt = 0;
    for (i = 0; i < 16; i++) {
    if (buckets[i]) {
    cnt += (1 << (buckets[i] - 1)); // 2^buckets[i]
    }
    }
    return cnt;

    }

    void diagnosticReport(unsigned int* buckets) {
    int i;
    for (i = 0; i < 16; i++) {
    printf("bucket %i: %08x\n", i, buckets[i]);
    }
    printf("Cardinality %i\n", cardinality(buckets));
    }

    int main() {
    /*zero 16 buckets*/
    unsigned int buckets[16];
    initBuckets(buckets);

    unsigned int inputNumber = 0;
    unsigned int tryNo = 0;
    while (1) {
    printf("%ith Value for HLL\n", tryNo);
    /*user enters a number,
    *
    * in real HLL we would hash the number, we're sorta expecting the user
    * to bash on keys*/
    scanf("%u", &inputNumber);

    /*
    * hash this number to a bucket based on the <numbuckets> significant
    * digits*/
    unsigned int buck = bucket(inputNumber);
    printf("Hashed To Bucket %i\n", buck);

    /* Write the largest number of trailing 0s we've seen thus far
    *
    * This is the magic of the hyperloglog, it gets very very hard
    * to get more trailing zeros after some point
    *
    * Its much like flipping a coin
    * */
    buckets[buck] = max(buckets[buck], trailingZeros(inputNumber));

    /* Print diagnistic, including cardinality. which is just a sum of
    * 2^bucketVal over all buckets if bucketVal != 0
    * */
    diagnosticReport(buckets);
    tryNo++;
    }

    }