Created
December 29, 2015 19:28
-
-
Save barrettclark/a9cf7e2f6b69efbdc8de to your computer and use it in GitHub Desktop.
Revisions
-
Barrett Clark created this gist
Dec 29, 2015 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,54 @@ WITH raw_data AS ( SELECT jurisdictions AS county, median_value AS value FROM maryland_residential_sales_figures WHERE median_value > 999 ), ntiles AS ( SELECT county, value, ROW_NUMBER() OVER (PARTITION BY county ORDER BY value) AS row_number, COUNT(*) OVER (PARTITION BY county ) AS total, NTILE(2) OVER (PARTITION BY county ORDER BY value) AS bitile FROM raw_data ), medians AS ( SELECT county, total, AVG(CASE WHEN total % 2 > 0 THEN MAX(value) FILTER(WHERE bitile=1) ELSE (MAX(value) FILTER(WHERE bitile=1) + MIN(value) FILTER(WHERE bitile=2)) / 2.0 END ) OVER (PARTITION BY county) as median FROM ntiles GROUP BY 1, 2 ), quartiles AS ( SELECT ntiles.county, MEDIAN(value) FILTER(WHERE value < median) as q1, AVG(median) AS median, MEDIAN(value) FILTER(WHERE value > median) as q3 FROM ntiles JOIN medians on medians.county = ntiles.county GROUP BY 1 ORDER BY 1 ) SELECT quartiles.county, ARRAY_TO_STRING( ARRAY_AGG( CASE WHEN value < q1 - ((q3-q1) * 1.5) THEN value::VARCHAR ELSE NULL END ), ',') AS lower_outliers, MIN(CASE WHEN value >= q1 - ((q3-q1) * 1.5) THEN value ELSE NULL END ) AS minimum, quartiles.q1, quartiles.median, quartiles.q3, MAX(CASE WHEN value <= q3 + ((q3-q1) * 1.5) THEN value ELSE NULL END ) AS maximum, ARRAY_TO_STRING( ARRAY_AGG( CASE WHEN value > q3 + ((q3-q1) * 1.5) THEN value::VARCHAR ELSE NULL END ), ',') AS upper_outliers FROM quartiles JOIN raw_data on quartiles.county = raw_data.county GROUP BY quartiles.county, quartiles.q1, quartiles.median, quartiles.q3 ORDER BY quartiles.county This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,27 @@ CREATE OR REPLACE FUNCTION _final_median(anyarray) RETURNS float8 AS $$ WITH q AS ( SELECT val FROM unnest($1) val WHERE VAL IS NOT NULL ORDER BY 1 ), cnt AS ( SELECT COUNT(*) AS c FROM q ) SELECT AVG(val)::float8 FROM ( SELECT val FROM q LIMIT 2 - MOD((SELECT c FROM cnt), 2) OFFSET GREATEST(CEIL((SELECT c FROM cnt) / 2.0) - 1,0) ) q2; $$ LANGUAGE SQL IMMUTABLE; CREATE AGGREGATE median(anyelement) ( SFUNC=array_append, STYPE=anyarray, FINALFUNC=_final_median, INITCOND='{}' ); This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,49 @@ -- https://blog.modeanalytics.com/how-to-make-box-and-whisker-plot-sql/ WITH raw_data AS ( SELECT jurisdictions AS county, median_value AS value FROM maryland_residential_sales_figures WHERE median_value > 999 ), details AS ( SELECT county, value, ROW_NUMBER() OVER (PARTITION BY county ORDER BY value) AS row_number, COUNT(*) OVER (PARTITION BY county) AS total FROM raw_data ), quartiles AS ( -- NOTE: q1 and q3 could potentially include the median if it's repeated in the data SELECT county, value, AVG(CASE WHEN row_number BETWEEN FLOOR(total/2.0)/2.0 AND FLOOR(total/2.0)/2.0 + 1 THEN value ELSE NULL END ) OVER (PARTITION BY county) as q1, AVG(CASE WHEN row_number BETWEEN total/2.0 AND total/2.0 + 1 THEN value ELSE NULL END ) OVER (PARTITION BY county) AS median, AVG(CASE WHEN row_number BETWEEN CEIL(total/2.0) + FLOOR(total/2.0)/2.0 AND CEIL(total/2.0) + FLOOR(total/2.0)/2.0 + 1 THEN value/1.0 ELSE NULL END ) OVER (PARTITION BY county) AS q3 FROM details ) SELECT county, ARRAY_TO_STRING( ARRAY_AGG( CASE WHEN value < q1 - ((q3-q1) * 1.5) THEN value::VARCHAR ELSE NULL END ), ',') AS lower_outliers, MIN(CASE WHEN value >= q1 - ((q3-q1) * 1.5) THEN value ELSE NULL END ) AS minimum, AVG(q1) AS q1, AVG(median) AS median, AVG(q3) AS q3, MAX(CASE WHEN value <= q3 + ((q3-q1) * 1.5) THEN value ELSE NULL END ) AS maximum, ARRAY_TO_STRING( ARRAY_AGG( CASE WHEN value > q3 + ((q3-q1) * 1.5) THEN value::VARCHAR ELSE NULL END ), ',') AS upper_outliers FROM quartiles GROUP BY 1 ORDER BY 1