smalik’s gists

smalik / mad.sql

Created September 27, 2018 21:17

SQL code for median absolute deviation (MAD)

	-- Median absolute deviation (MAD) of value for each id.
	-- Outer query: per id, take the median of |value - med|.
	SELECT id, MEDIAN(ABS(value - med))
	FROM (
	-- Inner query: attach each id's median of value to every row as med.
	SELECT id, value, MEDIAN(value) OVER(PARTITION BY id) AS med
	FROM mytable
	)
	GROUP BY
	id

smalik / hist_to_distr.py

Created May 10, 2018 17:03

Fit distribution to histogram

	from scipy import stats
	import numpy as np
	import matplotlib.pylab as plt

	# create some normally distributed random noisy data:
	# a random scale factor in [0, 50) times 100 draws from N(10, 10), shifted by 20
	ser = 50 * np.random.rand() * np.random.normal(10, 10, 100) + 20

	# plot the normalised histogram (bar areas sum to 1)
	# density=True replaces normed=True, which current Matplotlib no longer accepts
	plt.hist(ser, density=True)

smalik / fit_weibull.py

Created May 10, 2018 17:00

Fit weibull distribution to data and estimate parameters

	from scipy.stats import exponweib
	from scipy.optimize import fmin
	import numpy as np

	# x is your data array
	# returns [shape, scale]

	def fitweibull(x):
	def optfun(theta):
	return -np.sum(np.log(exponweib.pdf(x, 1, theta[0], scale = theta[1], loc = 0)))

smalik / get_low_variance_columns.py

Created December 4, 2017 19:55

get_low_variance_columns

	import numpy as np
	import pandas as pd
	from sklearn.feature_selection import VarianceThreshold

	# Convenience function that wraps the VarianceThreshold transformer so that you can pass it a pandas DataFrame and get a DataFrame back

	def get_low_variance_columns(dframe=None, columns=None,
	skip_columns=None, thresh=0.0,
	autoremove=False):
	"""

smalik / vertica_column_storage.sql

Created February 16, 2016 20:00

	-- Compressed storage per anchor table, in GB (bytes / 1024^3),
	-- read from v_monitor.column_storage and listed largest first.
	SELECT
	    cs.anchor_table_schema,
	    cs.anchor_table_name,
	    SUM(cs.used_bytes) / (1024^3) AS used_compressed_gb
	FROM v_monitor.column_storage cs
	GROUP BY
	    cs.anchor_table_schema,
	    cs.anchor_table_name
	ORDER BY SUM(cs.used_bytes) DESC;

smalik / vertica_projection_storage.sql

Created February 16, 2016 19:59

	-- Compressed storage per anchor table, in GB (bytes / 1024^3),
	-- read from v_monitor.projection_storage and listed largest first.
	SELECT
	    ps.anchor_table_schema,
	    ps.anchor_table_name,
	    SUM(ps.used_bytes) / (1024^3) AS used_compressed_gb
	FROM v_monitor.projection_storage ps
	GROUP BY
	    ps.anchor_table_schema,
	    ps.anchor_table_name
	ORDER BY SUM(ps.used_bytes) DESC;

smalik / howMuchRam.R

Created January 14, 2016 15:28

	# Ripped off from http://stackoverflow.com/questions/21754319/rule-of-thumb-for-memory-size-of-datasets-in-r
	# original author Carlos Cinelli

	howMuchRAM <-function(ncol, nrow, cushion=3){
	#40 bytes per col
	colBytes <- ncol*40

	#8 bytes per cell
	cellBytes <- ncolnrow8

smalik / filesplits.sh

Created November 6, 2015 18:32

Script to split a large CSV file into smaller parts named in sequence

	#!/bin/bash
	fname=devices1518.csv
	HDR=$(head -1 $fname) # Pick up CSV header line to apply to each file
	split -l 1000000 $fname prt # Split the file into chunks of 1M lines each
	n=1
	for f in prt* # iterate over chunks
	do
	echo $HDR > Part${n} # Write out header to new files"
	cat $f >> Part${n} # Add in lines that were split
	rm $f

smalik / install_packages.R

Created October 27, 2015 16:59

	# Point R at the package library on the data partition.
	# The path must be a quoted string; replace <Data partition> with the real mount point.
	.libPaths("/<Data partition>/R/library")

	# Read the package list and install every package named in its first column.
	plist <- read.csv('~/Downloads/packages.csv')
	install.packages(as.character(plist[, 1]))

smalik / R_object_sizer.R

Created October 27, 2015 14:14


	# Get memory allocation on an object-by-object basis: loop over the objects
	# returned by ls() and print each name next to its size in bytes.
	for (itm in ls()) {
	  # Keep the size numeric. Combining it with the name via c() turns it into a
	  # string, and formatC() then only pads it instead of applying format/big.mark.
	  size_bytes <- as.numeric(object.size(get(itm)))
	  print(
	    c(
	      formatC(itm, width = 30),
	      formatC(size_bytes, format = "d", big.mark = ",", width = 30)
	    ),
	    quote = FALSE
	  )
	}

	# Print R's memory profile: cell usage broken down by internal object type
	# for the whole session (not a per-object byte total).
	print(memory.profile())

Sulaiman Malik smalik