@tmcw · Created February 18, 2013 13:46

README.md

Extracted from [simple-statistics](https://github.com/tmcw/simple-statistics).
jenks.js
// # [Jenks natural breaks optimization](http://en.wikipedia.org/wiki/Jenks_natural_breaks_optimization)
//
// Implementations: [1](http://danieljlewis.org/files/2010/06/Jenks.pdf) (python),
// [2](https://github.com/vvoovv/djeo-jenks/blob/master/main.js) (buggy),
// [3](https://github.com/simogeo/geostats/blob/master/lib/geostats.js#L407) (works)
function jenks(data, n_classes) {

    // Compute the matrices required for Jenks breaks. These matrices
    // can be used for any classing of data with `classes <= n_classes`
    function getMatrices(data, n_classes) {

        // in the original implementation, these matrices are referred to
        // as `LC` and `OP`
        //
        // * lower_class_limits (LC): optimal lower class limits
        // * variance_combinations (OP): optimal variance combinations for all classes
        var lower_class_limits = [],
            variance_combinations = [],
            // loop counters
            i, j,
            // the variance, as computed at each step in the calculation
            variance = 0;

        // Initialize and fill each matrix with zeroes
        for (i = 0; i < data.length + 1; i++) {
            var tmp1 = [], tmp2 = [];
            for (j = 0; j < n_classes + 1; j++) {
                tmp1.push(0);
                tmp2.push(0);
            }
            lower_class_limits.push(tmp1);
            variance_combinations.push(tmp2);
        }
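
        // both matrices have (data.length + 1) rows and (n_classes + 1)
        // columns: row `l` describes the first `l` values of the sorted
        // data and column `j` describes a partition into `j` classes;
        // row 0 and column 0 are unused padding so the indexes below can
        // stay 1-based, matching the original implementation the comments
        // refer to.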

        for (i = 1; i < n_classes + 1; i++) {
            lower_class_limits[1][i] = 1;
            variance_combinations[1][i] = 0;
            // in the original implementation, 9999999 is used but
            // since JavaScript has `Infinity`, we use that.
            for (j = 2; j < data.length + 1; j++) {
                variance_combinations[j][i] = Infinity;
            }
        }
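
        // the loop below fills the matrices with a dynamic-programming
        // recurrence: the best (lowest-variance) `j`-class partition of
        // the first `l` values is found by trying every candidate start
        // `i4 + 1` for the last class and taking the minimum of
        //
        //     variance(values i4 + 1 .. l) + variance_combinations[i4][j - 1]
        //
        // `lower_class_limits[l][j]` records the start that achieves it.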

        for (var l = 2; l < data.length + 1; l++) {

            // `SZ` originally. this is the sum of the values seen thus
            // far when calculating variance.
            var sum = 0,
                // `ZSQ` originally. the sum of squares of values seen
                // thus far
                sum_squares = 0,
                // `WT` originally. this is the number of data points
                // considered so far
                w = 0,
                // `IV` originally
                i4 = 0;

            // in several instances, you could say `Math.pow(x, 2)`
            // instead of `x * x`, but this is slower in some browsers
            // and introduces an unnecessary concept.
            for (var m = 1; m < l + 1; m++) {

                // `III` originally
                var lower_class_limit = l - m + 1,
                    val = data[lower_class_limit - 1];

                // here we're estimating variance for each potential classing
                // of the data, for each potential number of classes. `w`
                // is the number of data points considered so far.
                w++;

                // increase the current sum and sum-of-squares
                sum += val;
                sum_squares += val * val;

                // the variance at this point in the sequence is the
                // difference between the sum of squares and the square
                // of the sum divided by the number of samples.
                variance = sum_squares - (sum * sum) / w;
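                // (this is the identity sum((x - mean)^2) = sum(x^2) - (sum(x))^2 / n,
                // so each candidate class's squared deviation from its mean
                // comes straight from the running sums, with no second pass
                // over the data)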

                i4 = lower_class_limit - 1;

                if (i4 !== 0) {
                    for (j = 2; j < n_classes + 1; j++) {
                        // if adding this element to an existing class
                        // will increase its variance beyond the limit, break
                        // the class at this point, setting the
                        // lower_class_limit here.
                        if (variance_combinations[l][j] >=
                            (variance + variance_combinations[i4][j - 1])) {
                            lower_class_limits[l][j] = lower_class_limit;
                            variance_combinations[l][j] = variance +
                                variance_combinations[i4][j - 1];
                        }
                    }
                }
            }

            lower_class_limits[l][1] = 1;
            variance_combinations[l][1] = variance;
        }

        // return the two matrices. for just providing breaks, only
        // `lower_class_limits` is needed, but variances can be useful to
        // evaluate goodness of fit.
        return {
            lower_class_limits: lower_class_limits,
            variance_combinations: variance_combinations
        };
    }



    // the second part of the jenks recipe: take the calculated matrices
    // and derive an array of n breaks.
    function breaks(data, lower_class_limits, n_classes) {

        // the matrices are indexed by the number of data points
        // considered, so row `data.length` of `lower_class_limits`
        // describes the full, sorted dataset; the backtracking below
        // starts from that row.
        var k = data.length,
            kclass = [],
            countNum = n_classes;

        // the calculation of classes will never include the upper and
        // lower bounds, so we need to explicitly set them
        kclass[n_classes] = data[data.length - 1];
        kclass[0] = data[0];

        // the lower_class_limits matrix is used as indexes into itself
        // here: the `k` variable is reused in each iteration.
        while (countNum > 1) {
            kclass[countNum - 1] = data[lower_class_limits[k][countNum] - 2];
            k = lower_class_limits[k][countNum] - 1;
            countNum--;
        }

        return kclass;
    }

    if (n_classes > data.length) return null;

    // sort data in numerical order, since this is expected
    // by the matrices function
    data = data.slice().sort(function (a, b) { return a - b; });

    // get our basic matrices
    var matrices = getMatrices(data, n_classes),
        // we only need lower class limits here
        lower_class_limits = matrices.lower_class_limits;

    // extract n_classes out of the computed matrices
    return breaks(data, lower_class_limits, n_classes);

}
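
For reference, a minimal usage sketch (the sample data and variable name below are illustrative, not part of the gist):

    // classify eight values into three classes; `jenks` returns an array of
    // n_classes + 1 class boundaries, whose first and last entries are the
    // minimum and maximum of the sorted input
    var classBreaks = jenks([1, 2, 4, 5, 7, 9, 10, 20], 3);
    // classBreaks.length === 4, classBreaks[0] === 1, classBreaks[3] === 20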