# Classify SeaFlow particles into populations (`pop`) with a decision tree,
# a random forest and a support vector machine, and report the accuracy of
# each model on a held-out test set.

library(caret)
library(ggplot2)
library(rpart)
library(randomForest)
library(e1071)

# Fix the RNG state so the random splits (and the random forest) are
# reproducible from run to run.
set.seed(42)

# Fraction of correctly predicted labels. Labels are compared as character
# so the result does not depend on both inputs being factors with identical
# level sets.
classification_accuracy <- function(predicted, actual) {
  mean(as.character(predicted) == as.character(actual))
}

# Step 1. Load the data ----
seaflow <- read.csv("seaflow_21min.csv", header = TRUE)
# read.csv() keeps strings as character since R 4.0, but randomForest() and
# svm() only perform classification when the response is a factor.
seaflow$pop <- as.factor(seaflow$pop)
summary(seaflow)

# Step 2. Split into training and test sets ----
# One stratified 50% partition; the test set is its complement, so the two
# sets never share a row.
data.part <- createDataPartition(seaflow$pop, times = 1, p = 0.5)
train.df <- seaflow[data.part$Resample1, ]
test.df <- seaflow[-data.part$Resample1, ]
summary(train.df)

# Step 3. Plot pe against chl_small, coloured by population ----
qplot(x = chl_small, y = pe, data = seaflow, color = pop)

# Step 4. Decision tree ----
fol <- formula(pop ~ fsc_small + fsc_perp + fsc_big + pe + chl_big + chl_small)
model <- rpart(fol, method = "class", data = train.df)
print(model)

# Step 5. Evaluate the decision tree on the test set ----
# type = "class" returns the predicted label directly instead of the
# per-class probability matrix.
popfit <- predict(model, test.df, type = "class")
accuracy <- classification_accuracy(popfit, test.df$pop)
print(accuracy)

# Step 6. Random forest ----
model <- randomForest(fol, data = train.df)
print(model)
popfit <- predict(model, test.df)
accuracy <- classification_accuracy(popfit, test.df$pop)
print(accuracy)
imp_randomforest <- importance(model)

# Support vector machine ----
model <- svm(fol, data = train.df)
popfit <- predict(model, test.df)
accuracy <- classification_accuracy(popfit, test.df$pop)
print(accuracy)
svmct <- table(pred = popfit, true = test.df$pop)

# SVM again with the rows of file_id 208 removed ----
# NOTE(review): presumably file 208 holds faulty measurements -- confirm.
newseaflow <- seaflow[seaflow$file_id != 208, ]
# Plain (non-stratified) random half of the rows for training, the rest for
# testing.
newa <- sample(nrow(newseaflow), floor(nrow(newseaflow) * 0.5))
trainnewseaflow <- newseaflow[newa, ]
newb <- seq_len(nrow(newseaflow))
testnewseaflow <- newseaflow[setdiff(newb, newa), ]

model <- svm(fol, data = trainnewseaflow)
popfit <- predict(model, testnewseaflow)
accuracy <- classification_accuracy(popfit, testnewseaflow$pop)
print(accuracy)
svmct <- table(pred = popfit, true = testnewseaflow$pop)