daroczig · November 19, 2018 09:07 · Nov 19, 2018
diff --git a/BCE-MDDA-2018.R b/BCE-MDDA-2018.R
@@ -0,0 +1,276 @@
+## #############################################################################
+## PCA demo on image processing
+## #############################################################################
+
+download.file('http://bit.ly/nasa-image-pca', 'image.jpg') # mode = »bw«
+
+library(jpeg)
+img <- readJPEG('image.jpg')
+str(img)
+
+dim(img)
+h <- dim(img)[1]
+w <- dim(img)[2]
+
+img1d <- matrix(img, h * w)
+pca <- prcomp(img1d)
+pca
+summary(pca)
+
+image(matrix(pca$x[, 1], h))
+image(matrix(pca$x[, 2], h))
+image(matrix(pca$x[, 2], h), col = gray.colors(100))
+
+image(matrix(pca$x[, 3], h))
+
+## #############################################################################
+## MDS
+## #############################################################################
+
+download.file('http://bit.ly/hun-cities-distance', 'cities.xls') # mode = »bw«
+
+library(readxl)
+cities <- read_excel('cities .xls')
+
+## get rid of 1st column and last row (metadata)
+cities <- cities[, -1]
+cities <- cities[-nrow(cities), ]
+
+mds <- cmdscale(as.dist(cities))
+
+plot(mds)
+text(mds[, 1], mds[, 2], tail(names(cities), -1))
+
+mds <- -mds
+plot(mds)
+text(mds[, 1], mds[, 2], tail(names(cities), -1))
+
+mds <- cmdscale(dist(mtcars))
+plot(mds)
+text(mds[, 1], mds[, 2], rownames(mtcars))
+
+mds <- as.data.frame(mds)
+library(ggplot2) # start with geom_point
+ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) + geom_text()
+
+library(ggrepel)
+ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) + geom_text_repel()
+
+## #############################################################################
+## feature selection
+## #############################################################################
+
+str(iris)
+plot(iris$Sepal.Length, iris$Sepal.Width)
+
+fit <- lm(Sepal.Width ~ Sepal.Length, data = iris)
+fit
+summary(fit)
+
+plot(iris$Sepal.Length, iris$Sepal.Width)
+abline(fit, col = 'red', lwd = 5)
+
+## now try
+plot(iris$Sepal.Length, iris$Sepal.Width, col = iris$Species)
+summary(lm(Sepal.Width ~ Sepal.Length + Species, data = iris))
+
+ggplot(iris, aes(Sepal.Length, Sepal.Width, col = Species)) +
+  geom_point() + geom_smooth(method = 'lm')
+
+ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
+  geom_point(aes(col = Species)) +
+  geom_smooth(aes(col = Species), method = 'lm') +
+  geom_smooth(method = 'lm', col = 'black') + theme_bw()
+
+## #############################################################################
+## PCA demo to show the steps in hierarchical clustering
+## #############################################################################
+
+prcomp(iris[, 1:4])
+PC <- prcomp(iris[, 1:4])$x
+dm <- dist(iris[, 1:4])
+hc <- hclust(dm)
+plot(hc)
+
+for (i in 2:8) {
+  plot(hc)
+  rect.hclust(hc, k = i, border = 'red')
+  Sys.sleep(1)
+}
+
+library(animation)
+ani.options(interval = 1)
+saveGIF({
+  for (i in 2:8) {
+    plot(hc)
+    rect.hclust(hc, k = i, border = 'red')
+  }
+})
+
+library(animation)
+ani.options(interval = 1)
+saveGIF({
+  for (i in 1:8) {
+    plot(-PC[, 1:2], col = cutree(hc, i), pch = as.numeric(factor(iris$Species)) + 5)
+  }
+})
+
+## #############################################################################
+## basic decision tress
+## #############################################################################
+
+rm(iris)
+
+set.seed(100)
+mysample <- sample(1:150, 100)
+str(mysample)
+
+train <- iris[mysample, ]
+test  <- iris[setdiff(1:150, mysample), ]
+
+library(rpart)
+ct <- rpart(Species ~ ., data = train)
+summary(ct)
+
+plot(ct); text(ct)
+
+library(partykit)
+plot(as.party(ct))
+
+predict(ct, newdata = test)
+predict(ct, newdata = test, type = 'class')
+
+## confusion matrix
+table(test$Species,
+      predict(ct, newdata = test,
+              type = 'class'))
+
+ct <- rpart(Species ~ ., data = train,
+            minsplit = 1, cp = 0.001)
+## ?rpart.control
+plot(ct); text(ct)
+plot(as.party(ct))
+
+table(test$Species,
+      predict(ct, newdata = test,
+              type = 'class'))
+
+## back to slides on high level overview on bagging, random forest, gbm etc
+
+## #############################################################################
+## H2O
+## #############################################################################
+
+## start and connect to H2O
+library(h2o)
+h2o.init()
+## in a browser check http://localhost:54321
+
+rpart()
+h2o.randomForest()
+
+## demo data
+library(hflights)
+rm(hflights)
+str(hflights)
+
+## copy to H2O
+hflights.hex <- as.h2o(hflights, 'hflights')
+
+str(hflights.hex)
+str(head(hflights.hex))
+head(hflights.hex, 3)
+summary(hflights.hex)
+## check flight number: numeric?
+## check on H2O web interface as well (enum vs factor)
+
+## convert numeric to factor/enum
+hflights.hex[, 'FlightNum'] <- as.factor(hflights.hex[, 'FlightNum'])
+summary(hflights.hex)
+
+## boring
+for (v in c('Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'ArrTime', 'Cancelled')) {
+    hflights.hex[, v] <- as.factor(hflights.hex[, v])
+}
+
+summary(hflights.hex)
+
+## feature engineering: departure time? is it OK? hour of the day?
+## redo everything... just use the R script
+library(data.table)
+hflights <- data.table(hflights)
+hflights[, hour := substr(DepTime, 1, 2)]
+hflights[, .N, by = hour]
+
+hflights[, hour := substr(DepTime, 1, nchar(DepTime) - 2)]
+hflights[, .N, by = hour][order(hour)]
+hflights[hour == '']
+
+hflights[, hour := cut(as.numeric(hour), seq(0, 24, 4))]
+hflights[, .N, by = hour]
+
+hflights[is.na(hour)]
+hflights[, .N, by = .(is.na(hour), Cancelled == 1)]
+
+## drop columns
+hflights <- hflights[, .(Month, DayofMonth, DayOfWeek, Dest, Origin,
+                         UniqueCarrier, FlightNum, TailNum, Distance,
+                         Cancelled = factor(Cancelled))]
+
+
+## re-upload to H2O
+h2o.ls()
+hflights.hex <- as.h2o(as.data.frame(hflights), 'hflights')
+
+## split the data
+h2o.splitFrame(data = hflights.hex , ratios = 0.75,
+               destination_frames = paste0('hflights', 0:1))
+h2o.ls()
+
+## build the first model
+hflights.rf <- h2o.randomForest(
+    model_id = 'hflights_rf',
+    x = setdiff(names(hflights), 'Cancelled'),
+    y = 'Cancelled',
+    training_frame = 'hflights0',
+    validation_frame = 'hflights1')
+hflights.rf
+
+## more trees
+hflights.rf <- h2o.randomForest(
+    model_id = 'hflights_rf500',
+    x = setdiff(names(hflights), 'Cancelled'),
+    y = 'Cancelled',
+    training_frame = 'hflights0',
+    validation_frame = 'hflights1', ntrees = 500)
+
+## change to UI and check ROC curve & AUC
+
+## GBM
+hflights.gbm <- h2o.gbm(
+    x = setdiff(names(hflights), 'Cancelled'),
+    y = 'Cancelled',
+    training_frame = 'hflights0',
+    validation_frame = 'hflights1',
+    model_id = 'hflights_gbm')
+
+## more trees should help, again !!!
+hflights.gbm <- h2o.gbm(
+    x = setdiff(names(hflights), 'Cancelled'),
+    y = 'Cancelled',
+    training_frame = 'hflights0',
+    validation_frame = 'hflights1',
+    model_id = 'hflights_gbm2', ntrees = 550)
+
+## but no: although higher training AUC, lower validation AUC => overfit
+
+## more trees should help, again !!!
+hflights.gbm <- h2o.gbm(
+    x = setdiff(names(hflights), 'Cancelled'),
+    y = 'Cancelled',
+    training_frame = 'hflights_part0',
+    validation_frame = 'hflights_part1',
+    model_id = 'hflights_gbm2', ntrees = 250, learn_rate = 0.01)
+
+## bye
+h2o.shutdown()
No results found