Created
November 19, 2018 09:07
-
-
Save daroczig/f9a0cba2691f99617d0bfd20578bf8e3 to your computer and use it in GitHub Desktop.
Revisions
-
daroczig created this gist
Nov 19, 2018 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,276 @@ ## ############################################################################# ## PCA demo on image processing ## ############################################################################# download.file('http://bit.ly/nasa-image-pca', 'image.jpg') # mode = »bw« library(jpeg) img <- readJPEG('image.jpg') str(img) dim(img) h <- dim(img)[1] w <- dim(img)[2] img1d <- matrix(img, h * w) pca <- prcomp(img1d) pca summary(pca) image(matrix(pca$x[, 1], h)) image(matrix(pca$x[, 2], h)) image(matrix(pca$x[, 2], h), col = gray.colors(100)) image(matrix(pca$x[, 3], h)) ## ############################################################################# ## MDS ## ############################################################################# download.file('http://bit.ly/hun-cities-distance', 'cities.xls') # mode = »bw« library(readxl) cities <- read_excel('cities .xls') ## get rid of 1st column and last row (metadata) cities <- cities[, -1] cities <- cities[-nrow(cities), ] mds <- cmdscale(as.dist(cities)) plot(mds) text(mds[, 1], mds[, 2], tail(names(cities), -1)) mds <- -mds plot(mds) text(mds[, 1], mds[, 2], tail(names(cities), -1)) mds <- cmdscale(dist(mtcars)) plot(mds) text(mds[, 1], mds[, 2], rownames(mtcars)) mds <- as.data.frame(mds) library(ggplot2) # start with geom_point ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) + geom_text() library(ggrepel) ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) + geom_text_repel() ## ############################################################################# ## feature selection ## ############################################################################# str(iris) plot(iris$Sepal.Length, iris$Sepal.Width) fit <- lm(Sepal.Width ~ Sepal.Length, data = iris) fit summary(fit) plot(iris$Sepal.Length, iris$Sepal.Width) abline(fit, col = 'red', lwd = 5) ## now try plot(iris$Sepal.Length, iris$Sepal.Width, col = iris$Species) summary(lm(Sepal.Width ~ Sepal.Length + Species, data = iris)) ggplot(iris, aes(Sepal.Length, Sepal.Width, col = Species)) + geom_point() + geom_smooth(method = 'lm') ggplot(iris, aes(Sepal.Length, Sepal.Width)) + geom_point(aes(col = Species)) + geom_smooth(aes(col = Species), method = 'lm') + geom_smooth(method = 'lm', col = 'black') + theme_bw() ## ############################################################################# ## PCA demo to show the steps in hierarchical clustering ## ############################################################################# prcomp(iris[, 1:4]) PC <- prcomp(iris[, 1:4])$x dm <- dist(iris[, 1:4]) hc <- hclust(dm) plot(hc) for (i in 2:8) { plot(hc) rect.hclust(hc, k = i, border = 'red') Sys.sleep(1) } library(animation) ani.options(interval = 1) saveGIF({ for (i in 2:8) { plot(hc) rect.hclust(hc, k = i, border = 'red') } }) library(animation) ani.options(interval = 1) saveGIF({ for (i in 1:8) { plot(-PC[, 1:2], col = cutree(hc, i), pch = as.numeric(factor(iris$Species)) + 5) } }) ## ############################################################################# ## basic decision tress ## ############################################################################# rm(iris) set.seed(100) mysample <- sample(1:150, 100) str(mysample) train <- iris[mysample, ] test <- iris[setdiff(1:150, mysample), ] library(rpart) ct <- rpart(Species ~ ., data = train) summary(ct) plot(ct); text(ct) library(partykit) plot(as.party(ct)) predict(ct, newdata = test) predict(ct, newdata = test, type = 'class') ## confusion matrix table(test$Species, predict(ct, newdata = test, type = 'class')) ct <- rpart(Species ~ ., data = train, minsplit = 1, cp = 0.001) ## ?rpart.control plot(ct); text(ct) plot(as.party(ct)) table(test$Species, predict(ct, newdata = test, type = 'class')) ## back to slides on high level overview on bagging, random forest, gbm etc ## ############################################################################# ## H2O ## ############################################################################# ## start and connect to H2O library(h2o) h2o.init() ## in a browser check http://localhost:54321 rpart() h2o.randomForest() ## demo data library(hflights) rm(hflights) str(hflights) ## copy to H2O hflights.hex <- as.h2o(hflights, 'hflights') str(hflights.hex) str(head(hflights.hex)) head(hflights.hex, 3) summary(hflights.hex) ## check flight number: numeric? ## check on H2O web interface as well (enum vs factor) ## convert numeric to factor/enum hflights.hex[, 'FlightNum'] <- as.factor(hflights.hex[, 'FlightNum']) summary(hflights.hex) ## boring for (v in c('Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'ArrTime', 'Cancelled')) { hflights.hex[, v] <- as.factor(hflights.hex[, v]) } summary(hflights.hex) ## feature engineering: departure time? is it OK? hour of the day? ## redo everything... just use the R script library(data.table) hflights <- data.table(hflights) hflights[, hour := substr(DepTime, 1, 2)] hflights[, .N, by = hour] hflights[, hour := substr(DepTime, 1, nchar(DepTime) - 2)] hflights[, .N, by = hour][order(hour)] hflights[hour == ''] hflights[, hour := cut(as.numeric(hour), seq(0, 24, 4))] hflights[, .N, by = hour] hflights[is.na(hour)] hflights[, .N, by = .(is.na(hour), Cancelled == 1)] ## drop columns hflights <- hflights[, .(Month, DayofMonth, DayOfWeek, Dest, Origin, UniqueCarrier, FlightNum, TailNum, Distance, Cancelled = factor(Cancelled))] ## re-upload to H2O h2o.ls() hflights.hex <- as.h2o(as.data.frame(hflights), 'hflights') ## split the data h2o.splitFrame(data = hflights.hex , ratios = 0.75, destination_frames = paste0('hflights', 0:1)) h2o.ls() ## build the first model hflights.rf <- h2o.randomForest( model_id = 'hflights_rf', x = setdiff(names(hflights), 'Cancelled'), y = 'Cancelled', training_frame = 'hflights0', validation_frame = 'hflights1') hflights.rf ## more trees hflights.rf <- h2o.randomForest( model_id = 'hflights_rf500', x = setdiff(names(hflights), 'Cancelled'), y = 'Cancelled', training_frame = 'hflights0', validation_frame = 'hflights1', ntrees = 500) ## change to UI and check ROC curve & AUC ## GBM hflights.gbm <- h2o.gbm( x = setdiff(names(hflights), 'Cancelled'), y = 'Cancelled', training_frame = 'hflights0', validation_frame = 'hflights1', model_id = 'hflights_gbm') ## more trees should help, again !!! hflights.gbm <- h2o.gbm( x = setdiff(names(hflights), 'Cancelled'), y = 'Cancelled', training_frame = 'hflights0', validation_frame = 'hflights1', model_id = 'hflights_gbm2', ntrees = 550) ## but no: although higher training AUC, lower validation AUC => overfit ## more trees should help, again !!! hflights.gbm <- h2o.gbm( x = setdiff(names(hflights), 'Cancelled'), y = 'Cancelled', training_frame = 'hflights_part0', validation_frame = 'hflights_part1', model_id = 'hflights_gbm2', ntrees = 250, learn_rate = 0.01) ## bye h2o.shutdown()