Skip to content

Instantly share code, notes, and snippets.

@daroczig
Created November 19, 2018 09:07
Show Gist options
  • Select an option

  • Save daroczig/f9a0cba2691f99617d0bfd20578bf8e3 to your computer and use it in GitHub Desktop.

Select an option

Save daroczig/f9a0cba2691f99617d0bfd20578bf8e3 to your computer and use it in GitHub Desktop.

Revisions

  1. daroczig created this gist Nov 19, 2018.
    276 changes: 276 additions & 0 deletions BCE-MDDA-2018.R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,276 @@
    ## #############################################################################
    ## PCA demo on image processing
    ## #############################################################################

    download.file('http://bit.ly/nasa-image-pca', 'image.jpg') # mode = »bw«

    library(jpeg)
    img <- readJPEG('image.jpg')
    str(img)

    dim(img)
    h <- dim(img)[1]
    w <- dim(img)[2]

    img1d <- matrix(img, h * w)
    pca <- prcomp(img1d)
    pca
    summary(pca)

    image(matrix(pca$x[, 1], h))
    image(matrix(pca$x[, 2], h))
    image(matrix(pca$x[, 2], h), col = gray.colors(100))

    image(matrix(pca$x[, 3], h))

    ## #############################################################################
    ## MDS
    ## #############################################################################

    download.file('http://bit.ly/hun-cities-distance', 'cities.xls') # mode = »bw«

    library(readxl)
    cities <- read_excel('cities .xls')

    ## get rid of 1st column and last row (metadata)
    cities <- cities[, -1]
    cities <- cities[-nrow(cities), ]

    mds <- cmdscale(as.dist(cities))

    plot(mds)
    text(mds[, 1], mds[, 2], tail(names(cities), -1))

    mds <- -mds
    plot(mds)
    text(mds[, 1], mds[, 2], tail(names(cities), -1))

    mds <- cmdscale(dist(mtcars))
    plot(mds)
    text(mds[, 1], mds[, 2], rownames(mtcars))

    mds <- as.data.frame(mds)
    library(ggplot2) # start with geom_point
    ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) + geom_text()

    library(ggrepel)
    ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) + geom_text_repel()

    ## #############################################################################
    ## feature selection
    ## #############################################################################

    str(iris)
    plot(iris$Sepal.Length, iris$Sepal.Width)

    fit <- lm(Sepal.Width ~ Sepal.Length, data = iris)
    fit
    summary(fit)

    plot(iris$Sepal.Length, iris$Sepal.Width)
    abline(fit, col = 'red', lwd = 5)

    ## now try
    plot(iris$Sepal.Length, iris$Sepal.Width, col = iris$Species)
    summary(lm(Sepal.Width ~ Sepal.Length + Species, data = iris))

    ggplot(iris, aes(Sepal.Length, Sepal.Width, col = Species)) +
    geom_point() + geom_smooth(method = 'lm')

    ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
    geom_point(aes(col = Species)) +
    geom_smooth(aes(col = Species), method = 'lm') +
    geom_smooth(method = 'lm', col = 'black') + theme_bw()

    ## #############################################################################
    ## PCA demo to show the steps in hierarchical clustering
    ## #############################################################################

    prcomp(iris[, 1:4])
    PC <- prcomp(iris[, 1:4])$x
    dm <- dist(iris[, 1:4])
    hc <- hclust(dm)
    plot(hc)

    for (i in 2:8) {
    plot(hc)
    rect.hclust(hc, k = i, border = 'red')
    Sys.sleep(1)
    }

    library(animation)
    ani.options(interval = 1)
    saveGIF({
    for (i in 2:8) {
    plot(hc)
    rect.hclust(hc, k = i, border = 'red')
    }
    })

    library(animation)
    ani.options(interval = 1)
    saveGIF({
    for (i in 1:8) {
    plot(-PC[, 1:2], col = cutree(hc, i), pch = as.numeric(factor(iris$Species)) + 5)
    }
    })

    ## #############################################################################
    ## basic decision tress
    ## #############################################################################

    rm(iris)

    set.seed(100)
    mysample <- sample(1:150, 100)
    str(mysample)

    train <- iris[mysample, ]
    test <- iris[setdiff(1:150, mysample), ]

    library(rpart)
    ct <- rpart(Species ~ ., data = train)
    summary(ct)

    plot(ct); text(ct)

    library(partykit)
    plot(as.party(ct))

    predict(ct, newdata = test)
    predict(ct, newdata = test, type = 'class')

    ## confusion matrix
    table(test$Species,
    predict(ct, newdata = test,
    type = 'class'))

    ct <- rpart(Species ~ ., data = train,
    minsplit = 1, cp = 0.001)
    ## ?rpart.control
    plot(ct); text(ct)
    plot(as.party(ct))

    table(test$Species,
    predict(ct, newdata = test,
    type = 'class'))

    ## back to slides on high level overview on bagging, random forest, gbm etc

    ## #############################################################################
    ## H2O
    ## #############################################################################

    ## start and connect to H2O
    library(h2o)
    h2o.init()
    ## in a browser check http://localhost:54321

    rpart()
    h2o.randomForest()

    ## demo data
    library(hflights)
    rm(hflights)
    str(hflights)

    ## copy to H2O
    hflights.hex <- as.h2o(hflights, 'hflights')

    str(hflights.hex)
    str(head(hflights.hex))
    head(hflights.hex, 3)
    summary(hflights.hex)
    ## check flight number: numeric?
    ## check on H2O web interface as well (enum vs factor)

    ## convert numeric to factor/enum
    hflights.hex[, 'FlightNum'] <- as.factor(hflights.hex[, 'FlightNum'])
    summary(hflights.hex)

    ## boring
    for (v in c('Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'ArrTime', 'Cancelled')) {
    hflights.hex[, v] <- as.factor(hflights.hex[, v])
    }

    summary(hflights.hex)

    ## feature engineering: departure time? is it OK? hour of the day?
    ## redo everything... just use the R script
    library(data.table)
    hflights <- data.table(hflights)
    hflights[, hour := substr(DepTime, 1, 2)]
    hflights[, .N, by = hour]

    hflights[, hour := substr(DepTime, 1, nchar(DepTime) - 2)]
    hflights[, .N, by = hour][order(hour)]
    hflights[hour == '']

    hflights[, hour := cut(as.numeric(hour), seq(0, 24, 4))]
    hflights[, .N, by = hour]

    hflights[is.na(hour)]
    hflights[, .N, by = .(is.na(hour), Cancelled == 1)]

    ## drop columns
    hflights <- hflights[, .(Month, DayofMonth, DayOfWeek, Dest, Origin,
    UniqueCarrier, FlightNum, TailNum, Distance,
    Cancelled = factor(Cancelled))]


    ## re-upload to H2O
    h2o.ls()
    hflights.hex <- as.h2o(as.data.frame(hflights), 'hflights')

    ## split the data
    h2o.splitFrame(data = hflights.hex , ratios = 0.75,
    destination_frames = paste0('hflights', 0:1))
    h2o.ls()

    ## build the first model
    hflights.rf <- h2o.randomForest(
    model_id = 'hflights_rf',
    x = setdiff(names(hflights), 'Cancelled'),
    y = 'Cancelled',
    training_frame = 'hflights0',
    validation_frame = 'hflights1')
    hflights.rf

    ## more trees
    hflights.rf <- h2o.randomForest(
    model_id = 'hflights_rf500',
    x = setdiff(names(hflights), 'Cancelled'),
    y = 'Cancelled',
    training_frame = 'hflights0',
    validation_frame = 'hflights1', ntrees = 500)

    ## change to UI and check ROC curve & AUC

    ## GBM
    hflights.gbm <- h2o.gbm(
    x = setdiff(names(hflights), 'Cancelled'),
    y = 'Cancelled',
    training_frame = 'hflights0',
    validation_frame = 'hflights1',
    model_id = 'hflights_gbm')

    ## more trees should help, again !!!
    hflights.gbm <- h2o.gbm(
    x = setdiff(names(hflights), 'Cancelled'),
    y = 'Cancelled',
    training_frame = 'hflights0',
    validation_frame = 'hflights1',
    model_id = 'hflights_gbm2', ntrees = 550)

    ## but no: although higher training AUC, lower validation AUC => overfit

    ## more trees should help, again !!!
    hflights.gbm <- h2o.gbm(
    x = setdiff(names(hflights), 'Cancelled'),
    y = 'Cancelled',
    training_frame = 'hflights_part0',
    validation_frame = 'hflights_part1',
    model_id = 'hflights_gbm2', ntrees = 250, learn_rate = 0.01)

    ## bye
    h2o.shutdown()