Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save barrettclark/a9cf7e2f6b69efbdc8de to your computer and use it in GitHub Desktop.

Select an option

Save barrettclark/a9cf7e2f6b69efbdc8de to your computer and use it in GitHub Desktop.

Revisions

  1. Barrett Clark created this gist Dec 29, 2015.
    54 changes: 54 additions & 0 deletions Box Plot Quartiles (exclude median from Q1 and Q3)
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,54 @@
    WITH raw_data AS (
    SELECT jurisdictions AS county, median_value AS value
    FROM maryland_residential_sales_figures
    WHERE median_value > 999
    ), ntiles AS (
    SELECT county,
    value,
    ROW_NUMBER() OVER (PARTITION BY county ORDER BY value) AS row_number,
    COUNT(*) OVER (PARTITION BY county ) AS total,
    NTILE(2) OVER (PARTITION BY county ORDER BY value) AS bitile
    FROM raw_data
    ), medians AS (
    SELECT county, total,
    AVG(CASE WHEN total % 2 > 0
    THEN MAX(value) FILTER(WHERE bitile=1)
    ELSE (MAX(value) FILTER(WHERE bitile=1) + MIN(value) FILTER(WHERE bitile=2)) / 2.0
    END
    ) OVER (PARTITION BY county) as median
    FROM ntiles
    GROUP BY 1, 2
    ), quartiles AS (
    SELECT ntiles.county,
    MEDIAN(value) FILTER(WHERE value < median) as q1,
    AVG(median) AS median,
    MEDIAN(value) FILTER(WHERE value > median) as q3
    FROM ntiles
    JOIN medians on medians.county = ntiles.county
    GROUP BY 1
    ORDER BY 1
    )
    SELECT quartiles.county,
    ARRAY_TO_STRING(
    ARRAY_AGG(
    CASE WHEN value < q1 - ((q3-q1) * 1.5)
    THEN value::VARCHAR ELSE NULL END
    ), ',') AS lower_outliers,
    MIN(CASE WHEN value >= q1 - ((q3-q1) * 1.5)
    THEN value ELSE NULL END
    ) AS minimum,
    quartiles.q1,
    quartiles.median,
    quartiles.q3,
    MAX(CASE WHEN value <= q3 + ((q3-q1) * 1.5)
    THEN value ELSE NULL END
    ) AS maximum,
    ARRAY_TO_STRING(
    ARRAY_AGG(
    CASE WHEN value > q3 + ((q3-q1) * 1.5)
    THEN value::VARCHAR ELSE NULL END
    ), ',') AS upper_outliers
    FROM quartiles
    JOIN raw_data on quartiles.county = raw_data.county
    GROUP BY quartiles.county, quartiles.q1, quartiles.median, quartiles.q3
    ORDER BY quartiles.county
    27 changes: 27 additions & 0 deletions Create MEDIAN function
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,27 @@
    CREATE OR REPLACE FUNCTION _final_median(anyarray) RETURNS float8 AS $$
    WITH q AS
    (
    SELECT val
    FROM unnest($1) val
    WHERE VAL IS NOT NULL
    ORDER BY 1
    ),
    cnt AS
    (
    SELECT COUNT(*) AS c FROM q
    )
    SELECT AVG(val)::float8
    FROM
    (
    SELECT val FROM q
    LIMIT 2 - MOD((SELECT c FROM cnt), 2)
    OFFSET GREATEST(CEIL((SELECT c FROM cnt) / 2.0) - 1,0)
    ) q2;
    $$ LANGUAGE SQL IMMUTABLE;

    CREATE AGGREGATE median(anyelement) (
    SFUNC=array_append,
    STYPE=anyarray,
    FINALFUNC=_final_median,
    INITCOND='{}'
    );
    49 changes: 49 additions & 0 deletions Simple Box Plot Quartiles
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,49 @@
    -- https://blog.modeanalytics.com/how-to-make-box-and-whisker-plot-sql/
    WITH raw_data AS (
    SELECT jurisdictions AS county, median_value AS value
    FROM maryland_residential_sales_figures
    WHERE median_value > 999
    ), details AS (
    SELECT county,
    value,
    ROW_NUMBER() OVER (PARTITION BY county ORDER BY value) AS row_number,
    COUNT(*) OVER (PARTITION BY county) AS total
    FROM raw_data
    ), quartiles AS (
    -- NOTE: q1 and q3 could potentially include the median if it's repeated in the data
    SELECT county,
    value,
    AVG(CASE WHEN row_number BETWEEN FLOOR(total/2.0)/2.0 AND FLOOR(total/2.0)/2.0 + 1
    THEN value ELSE NULL END
    ) OVER (PARTITION BY county) as q1,
    AVG(CASE WHEN row_number BETWEEN total/2.0 AND total/2.0 + 1
    THEN value ELSE NULL END
    ) OVER (PARTITION BY county) AS median,
    AVG(CASE WHEN row_number BETWEEN CEIL(total/2.0) + FLOOR(total/2.0)/2.0 AND CEIL(total/2.0) + FLOOR(total/2.0)/2.0 + 1
    THEN value/1.0 ELSE NULL END
    ) OVER (PARTITION BY county) AS q3
    FROM details
    )
    SELECT county,
    ARRAY_TO_STRING(
    ARRAY_AGG(
    CASE WHEN value < q1 - ((q3-q1) * 1.5)
    THEN value::VARCHAR ELSE NULL END
    ), ',') AS lower_outliers,
    MIN(CASE WHEN value >= q1 - ((q3-q1) * 1.5)
    THEN value ELSE NULL END
    ) AS minimum,
    AVG(q1) AS q1,
    AVG(median) AS median,
    AVG(q3) AS q3,
    MAX(CASE WHEN value <= q3 + ((q3-q1) * 1.5)
    THEN value ELSE NULL END
    ) AS maximum,
    ARRAY_TO_STRING(
    ARRAY_AGG(
    CASE WHEN value > q3 + ((q3-q1) * 1.5)
    THEN value::VARCHAR ELSE NULL END
    ), ',') AS upper_outliers
    FROM quartiles
    GROUP BY 1
    ORDER BY 1