Created
          October 10, 2015 07:05 
        
      - 
      
 - 
        
Save ledell/d5b51fd7bb2548636d87 to your computer and use it in GitHub Desktop.  
Revisions
- 
        
ledell created this gist
Oct 10, 2015. There are no files selected for viewing.
### Lending Club example using cleaned-up dataset & h2o.ensemble ###
#
# Trains an H2O Super Learner (stacked ensemble) to predict bad loans:
# first with the four default base-learner wrappers, then with a larger
# custom grid of base learners. Reports test-set AUC for the ensemble
# and for each base learner.

library(h2o)
h2o.init(nthreads = -1, max_mem_size = "8G")

loan_csv <- "https://raw.githubusercontent.com/h2oai/app-consumer-loan/master/data/loan.csv"
data <- h2o.importFile(loan_csv)  # 163994 x 15
data$bad_loan <- as.factor(data$bad_loan)  # encode response for binary classification

# Reproducible 80/20 train/test split
rand <- h2o.runif(data, seed = 1)
train <- data[rand$rnd <= 0.8, ]
valid <- data[rand$rnd > 0.8, ]

y <- "bad_loan"
# Exclude int_rate from the predictors (in addition to the response)
x <- setdiff(names(data), c(y, "int_rate"))

library(h2oEnsemble)

# Evaluate a fitted h2o.ensemble model on a test frame.
#
# Args:
#   fit:        object returned by h2o.ensemble()
#   test_frame: H2OFrame to score
#   response:   name of the response column in test_frame
# Returns a list with:
#   ensemble_auc: test AUC of the stacked ensemble
#   base_auc:     data.frame of per-base-learner test AUCs
evaluate_ensemble <- function(fit, test_frame, response) {
  # Use the predict() generic rather than calling the S3 method
  # predict.h2o.ensemble() directly
  pred <- predict(fit, test_frame)
  predictions <- as.data.frame(pred$pred)[, 3]  # third column, p1 is P(Y==1)
  labels <- as.data.frame(test_frame[, c(response)])[, 1]
  learner_names <- names(fit$basefits)
  # vapply over seq_along: type-stable and safe for zero-length input
  base_auc <- vapply(seq_along(learner_names), function(l) {
    cvAUC::AUC(predictions = as.data.frame(pred$basepred)[, l], labels = labels)
  }, numeric(1))
  list(ensemble_auc = cvAUC::AUC(predictions = predictions, labels = labels),
       base_auc = data.frame(learner = learner_names, auc = base_auc))
}

# ---- Ensemble 1: default wrapper base learners ----

# Specify the base learner library & the metalearner
learner <- c("h2o.glm.wrapper", "h2o.randomForest.wrapper",
             "h2o.gbm.wrapper", "h2o.deeplearning.wrapper")
metalearner <- "h2o.deeplearning.wrapper"
family <- "binomial"

# Train the ensemble using 5-fold CV to generate level-one data
# More CV folds will take longer to train, but should increase performance
fit <- h2o.ensemble(x = x, y = y,
                    training_frame = train,
                    validation_frame = NULL,
                    family = family,
                    learner = learner,
                    metalearner = metalearner,
                    cvControl = list(V = 5, shuffle = TRUE))

# Generate predictions on the test set and report AUCs
res1 <- evaluate_ensemble(fit, valid, y)
res1$ensemble_auc  # 0.6802715
res1$base_auc
#                    learner       auc
# 1          h2o.glm.wrapper 0.6721662
# 2 h2o.randomForest.wrapper 0.6673966
# 3          h2o.gbm.wrapper 0.6737319
# 4 h2o.deeplearning.wrapper 0.6696115

# ---- Ensemble 2: a more extensive set of base learners ----

# Custom learner wrappers: each fixes a few hyperparameters and forwards
# everything else to the corresponding *.wrapper via ...
h2o.glm.1 <- function(..., alpha = 0.0) h2o.glm.wrapper(..., alpha = alpha)
h2o.glm.2 <- function(..., alpha = 0.5) h2o.glm.wrapper(..., alpha = alpha)
h2o.glm.3 <- function(..., alpha = 1.0) h2o.glm.wrapper(..., alpha = alpha)
h2o.randomForest.1 <- function(..., ntrees = 200, nbins = 50, seed = 1)
  h2o.randomForest.wrapper(..., ntrees = ntrees, nbins = nbins, seed = seed)
h2o.randomForest.2 <- function(..., ntrees = 200, sample_rate = 0.75, seed = 1)
  h2o.randomForest.wrapper(..., ntrees = ntrees, sample_rate = sample_rate, seed = seed)
h2o.randomForest.3 <- function(..., ntrees = 200, sample_rate = 0.85, seed = 1)
  h2o.randomForest.wrapper(..., ntrees = ntrees, sample_rate = sample_rate, seed = seed)
h2o.gbm.1 <- function(..., ntrees = 100, nbins = 100, seed = 1)
  h2o.gbm.wrapper(..., ntrees = ntrees, nbins = nbins, seed = seed)
h2o.gbm.2 <- function(..., ntrees = 200, nbins = 50, seed = 1)
  h2o.gbm.wrapper(..., ntrees = ntrees, nbins = nbins, seed = seed)
h2o.gbm.3 <- function(..., ntrees = 100, max_depth = 10, seed = 1)
  h2o.gbm.wrapper(..., ntrees = ntrees, max_depth = max_depth, seed = seed)
h2o.gbm.4 <- function(..., ntrees = 100, col_sample_rate = 0.8, seed = 1)
  h2o.gbm.wrapper(..., ntrees = ntrees, col_sample_rate = col_sample_rate, seed = seed)
h2o.gbm.5 <- function(..., ntrees = 200, col_sample_rate = 0.8, seed = 1)
  h2o.gbm.wrapper(..., ntrees = ntrees, col_sample_rate = col_sample_rate, seed = seed)
h2o.gbm.6 <- function(..., ntrees = 200, col_sample_rate = 0.7, seed = 1)
  h2o.gbm.wrapper(..., ntrees = ntrees, col_sample_rate = col_sample_rate, seed = seed)
h2o.deeplearning.1 <- function(..., hidden = c(500, 500), activation = "Rectifier", seed = 1)
  h2o.deeplearning.wrapper(..., hidden = hidden, activation = activation, seed = seed)
h2o.deeplearning.2 <- function(..., hidden = c(200, 200, 200), activation = "Tanh", seed = 1)
  h2o.deeplearning.wrapper(..., hidden = hidden, activation = activation, seed = seed)
h2o.deeplearning.3 <- function(..., hidden = c(500, 500), activation = "RectifierWithDropout", seed = 1)
  h2o.deeplearning.wrapper(..., hidden = hidden, activation = activation, seed = seed)

learner <- c("h2o.glm.1", "h2o.glm.2", "h2o.glm.3",
             "h2o.randomForest.1", "h2o.randomForest.2", "h2o.randomForest.3",
             "h2o.gbm.1", "h2o.gbm.2", "h2o.gbm.3",
             "h2o.gbm.4", "h2o.gbm.5", "h2o.gbm.6",
             "h2o.deeplearning.1", "h2o.deeplearning.2", "h2o.deeplearning.3")
metalearner <- "h2o.deeplearning.wrapper"
family <- "binomial"

# Train the ensemble using 5-fold CV to generate level-one data
# More CV folds will take longer to train, but should increase performance
fit <- h2o.ensemble(x = x, y = y,
                    training_frame = train,
                    validation_frame = NULL,
                    family = family,
                    learner = learner,
                    metalearner = metalearner,
                    cvControl = list(V = 5, shuffle = TRUE))

# Generate predictions on the test set and report AUCs
res2 <- evaluate_ensemble(fit, valid, y)
res2$ensemble_auc  # 0.6832164
res2$base_auc
#               learner       auc
# 1           h2o.glm.1 0.6742993
# 2           h2o.glm.2 0.6743306
# 3           h2o.glm.3 0.6743672
# 4  h2o.randomForest.1 0.6702020
# 5  h2o.randomForest.2 0.6698333
# 6  h2o.randomForest.3 0.6698533
# 7           h2o.gbm.1 0.6801405
# 8           h2o.gbm.2 0.6785209
# 9           h2o.gbm.3 0.6699785
# 10          h2o.gbm.4 0.6805663
# 11          h2o.gbm.5 0.6790248
# 12          h2o.gbm.6 0.6790248
# 13 h2o.deeplearning.1 0.6772474
# 14 h2o.deeplearning.2 0.6696069
# 15 h2o.deeplearning.3 0.6802065