Sanjeev Chakravarty (sanjeevbadgeville)

  • SAP Sales Cloud
  • Dublin
# install docker-machine v0.8.2 into /usr/local/bin and confirm the binary works
curl -L https://github.com/docker/machine/releases/download/v0.8.2/docker-machine-`uname -s`-`uname -m` >/usr/local/bin/docker-machine && \
chmod +x /usr/local/bin/docker-machine
docker-machine version

# create a VirtualBox-backed machine named "default", then inspect it
docker-machine ls
docker-machine create --driver virtualbox default
docker-machine ls
docker-machine env default
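
docker-machine env default only prints shell export statements; to actually point the current shell's docker client at the new machine, the output is meant to be eval'd (a standard pattern, not part of the original snippet):

# apply the environment variables to this shell, then verify the client can reach the machine
eval "$(docker-machine env default)"
docker info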
sanjeevbadgeville / INSTALL
Created February 6, 2018 07:45 — forked from arya-oss/INSTALL.md
Ubuntu 16.04 Developer Tools installation
### development tools
sudo apt-get install build-essential python-dev git nodejs-legacy npm gnome-tweak-tool openjdk-8-jdk
### Python packages
sudo apt-get install python-pip python-virtualenv python-numpy python-matplotlib
### pip packages
pip install django flask django-widget-tweaks django-ckeditor beautifulsoup4 requests classifier SymPy ipython
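
Installing packages system-wide with sudo pip can clash with apt-managed Python files; a per-project virtualenv (installed above) keeps them isolated. A minimal sketch, with venv as a placeholder name:

# create and activate an isolated environment, then install into it instead of the system site-packages
virtualenv venv
source venv/bin/activate
pip install django flask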
sanjeevbadgeville / zeppelin_solr_spark_oh_my_meetup_notes.md
Created November 17, 2016 02:44 — forked from epugh/zeppelin_solr_spark_oh_my_meetup_notes.md
Steps for following along with Eric's Zeppelin talk.

The steps below all assume you have Docker installed. I used the Kitematic tool for OS X, and it worked great. My local container VM's IP is 192.168.99.100; replace it in the commands with your own local IP!

  1. Let's Set up Zeppelin

    I am using this Docker image https://github.com/dylanmei/docker-zeppelin to fire up Zeppelin and Spark. Note: it's slow because there are so many processes (Spark master, Spark worker, Zeppelin) to start!

    docker run -d --name zeppelin -p 8080:8080 dylanmei/zeppelin
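
    Once the container is up, a couple of quick checks (not in the original notes; 192.168.99.100 is the VM IP from above):

    docker ps --filter name=zeppelin   # confirm the container is running
    docker logs -f zeppelin            # watch the Spark master, Spark worker, and Zeppelin start up
    # then browse to http://192.168.99.100:8080 for the Zeppelin UI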
    
sanjeevbadgeville / Vertica_Query_Times
Created August 23, 2016 19:50 — forked from jackghm/Vertica_Query_Times
Vertica Query request times over time by user
-- Query request times over time by user
select distinct TheDay, user_name
, (min_dat / 1000) as min_sec, (max_dat / 1000) as max_sec
, (avg_dat / 1000) as avg_sec, (median_dat / 1000) as median_sec
, query_cnt
from (
select DATE(end_timestamp::timestamp) as TheDay, user_name
, min(request_duration_ms) over(partition by DATE(end_timestamp::timestamp), user_name ) min_dat
, max(request_duration_ms) over(partition by DATE(end_timestamp::timestamp), user_name ) max_dat
, avg(request_duration_ms) over(partition by DATE(end_timestamp::timestamp), user_name ) avg_dat
, median(request_duration_ms) over(partition by DATE(end_timestamp::timestamp), user_name ) median_dat
, count(*) over(partition by DATE(end_timestamp::timestamp), user_name ) query_cnt
-- the gist preview is truncated above this point; v_monitor.query_requests is an assumed
-- source table for request timings, completed here so the outer query's columns all resolve
from v_monitor.query_requests
) q
order by TheDay, user_name;
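
One way to run the saved query from a shell is Vertica's vsql client (a sketch; the host, user, and file name are placeholders):

# execute the query file against the cluster and print the results
vsql -h vertica.example.com -U dbadmin -f query_times.sql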
sanjeevbadgeville / correlation.R
Created June 9, 2016 00:31
Read a csv with multibyte characters and generate Correlation chart in R
source("/Users/sanjeev/workspaces/code/work/analytics/Analysis/R/common-start.R")
################################################### drupal Site
#drupal_plyrs_last30days <- read.xlsx("/Users/sanjeev/workspaces/code/work/analytics/Analysis/data/badgeville/drupal/MayPlayers.xlsx", sheetIndex = 1)
#drupal_plyrs_last30days[is.na(drupal_plyrs_last30days)] <- 0
needs(readr)
needs(corrgram)
# Resolve invalid multibyte string at '<ff><fe>D'
drupal_plyrs_last30days <- read.delim2("/Users/sanjeev/Downloads/Recognition_-_Player_Activity_Details_in_last_30_d_crosstab.csv", dec=".", fileEncoding="UCS-2LE")
# generate the correlation chart (the preview stops above; a minimal corrgram call to match the title)
corrgram(drupal_plyrs_last30days, order=TRUE, lower.panel=panel.shade, upper.panel=panel.pie)
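
Before picking a fileEncoding, the file utility can report the charset behind the <ff><fe> byte-order mark (a sketch; -I is the macOS spelling, Linux uses -i):

# print the MIME type and charset of the export, e.g. "charset=utf-16le"
file -I /Users/sanjeev/Downloads/Recognition_-_Player_Activity_Details_in_last_30_d_crosstab.csv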
sanjeevbadgeville / experiments-spark.R
Created April 20, 2016 18:17
Sample code for working with Apache Spark (v1.4), SparkR and ParquetFiles from RStudio
# see github repos & package documentation
# - http://github.com/apache/spark/tree/master/R
# - http://spark.apache.org/docs/latest/api/R/
# install the SparkR package
devtools::install_github("apache/spark", ref="master", subdir="R/pkg")
# load the SparkR & ggplot2 packages
library('SparkR')
library('ggplot2')
# the preview ends here; a minimal SparkR 1.4-style sketch of reading a Parquet file
# (the path is a placeholder)
sc <- sparkR.init(master = "local")
sqlContext <- sparkRSQL.init(sc)
df <- parquetFile(sqlContext, "/path/to/file.parquet")
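
# --- separate snippet: reinstall Homebrew, then set up a brewed Python ---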
/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/uninstall)"
/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
# set up some taps and update brew
brew tap homebrew/science # a lot of cool formulae for scientific tools
brew tap homebrew/python # numpy, scipy, matplotlib, ...
brew update && brew upgrade
# install a brewed python
brew install python
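
A quick check that the brewed Python is the one on the PATH (not in the original snippet; the path assumes Homebrew's default /usr/local prefix):

which python     # should print /usr/local/bin/python
python --version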
sanjeevbadgeville / tableau-upgrade-9.2.txt
Last active March 2, 2016 01:26
Tableau Upgrade with Hardware migration
Objective: Migrate content from the data center to AWS and upgrade Tableau Server from 9.1.3 to 9.2
1. Remove Unneeded Files
# to clean up all possible files and database entries, you should run tabadmin cleanup twice: once when Tableau Server is
# running, and once when it is stopped.
tabadmin cleanup
tabadmin stop
tabadmin cleanup
tabadmin start
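
The preview stops after cleanup; in a hardware migration the usual next step is a backup on the old server and a restore on the new one. A sketch assuming the standard Tableau 9.x tabadmin flow, with a placeholder file name:

# on the old server: write a backup (-d appends the date to the file name)
tabadmin backup tabserver_backup -d
# copy the resulting .tsbak to the new server, then restore it there
tabadmin stop
tabadmin restore tabserver_backup.tsbak
tabadmin start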
sanjeevbadgeville / generate.r
Created December 1, 2015 21:25 — forked from githoov/generate.r
R Script to Create a Survival Plot and to Generate a Sample Data Set
# preliminaries
library("ggplot2")
library("zoo")
set.seed(111)
# generate plot of survival curve
x <- sort(dexp(seq(0, 1, 0.01)), decreasing = TRUE)
ggplot(data.frame(x = c(0, 5)), aes(x)) +
  stat_function(fun = dexp, args = list(rate = 1)) +
  scale_x_continuous(labels = c(expression(t["0"], t["1"], t["2"], t["3"], t["4"], t["5"]))) +
  labs(x = "Time", y = expression(P(T > t["i"])), title = "Survival Function")
# simulate subscription data (the preview is truncated here; a minimal sketch using
# exponential lifetimes, consistent with the survival curve above)
subscriptions <- data.frame(id = 1:1000, lifetime = rexp(1000, rate = 1))

Typing vagrant at the command line displays a list of all available commands.

Be sure you are in the same directory as the Vagrantfile when running these commands! A typical end-to-end session is sketched after the list below.

Common Vagrant Commands

  • vagrant up -- starts the vagrant environment (provisioning runs only on the FIRST vagrant up)
  • vagrant status -- outputs the status of the vagrant machine
  • vagrant halt -- stops the vagrant machine
  • vagrant reload -- restarts the vagrant machine and loads any new Vagrantfile configuration
  • vagrant provision -- forces reprovisioning of the vagrant machine
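
A minimal session tying these together (a sketch; the box name hashicorp/bionic64 is just an example from Vagrant Cloud):

mkdir demo && cd demo
vagrant init hashicorp/bionic64   # writes a Vagrantfile for the chosen box
vagrant up                        # boots the VM (and provisions it, first time only)
vagrant ssh                       # opens a shell inside the machine
vagrant halt                      # shuts the machine down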