- 
      
- 
        Save angrycoffeemonster/20afafa7816a3613c75d to your computer and use it in GitHub Desktop. 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | #' Read xlsx files | |
| #' | |
| #' @param file The path to xlsx file | |
| #' @param keep_sheets A vector of sheet name | |
| #' @param header Whether include the head in the sheet | |
| #' @param empty_row Whether to remove the empty rows | |
| #' @export | |
| xlsxToR <- function(file, keep_sheets = NULL, header = TRUE, empty_row = TRUE) | |
| { | |
| suppressWarnings(file.remove(tempdir())) | |
| file.copy(file, tempdir()) | |
| new_file <- list.files(tempdir(), full.name = TRUE, pattern = basename(file)) | |
| new_file_rename <- gsub("xlsx$", "zip", new_file) | |
| file.rename(new_file, new_file_rename) | |
| unzip(new_file_rename, exdir = tempdir()) | |
| # Get OS | |
| mac <- readLines(paste0(tempdir(), "/docProps/app.xml"), warn = FALSE) | |
| mac <- grep("Macintosh", mac) | |
| if (length(mac) > 0) | |
| { | |
| os_origin <- "1899-12-30" # documentation says should be "1904-01-01" | |
| } else | |
| { | |
| os_origin <- "1899-12-30" | |
| } | |
| # Get names of sheets | |
| sheet_names_str <- readLines(paste0(tempdir(), "/xl/workbook.xml"), warn = FALSE)[2] | |
| sheet_names_str <- gsub('.*<sheets>(.*)</sheets>.*', '\\1', sheet_names_str) | |
| sheet_names_str <- strsplit(sheet_names_str, '/>')[[1]] | |
| sheet_names <- NULL | |
| sheet_names$name <- gsub('.* name="(.*)"( sheetId.*)', '\\1', sheet_names_str) | |
| sheet_names$sheetId <- gsub('.* sheetId="(\\d+)" +.*', '\\1', sheet_names_str) | |
| sheet_names$id <- gsub('.* r:id="(.*)"$', '\\1', sheet_names_str) | |
| sheet_names <- as.data.frame(sheet_names,stringsAsFactors = FALSE) | |
| sheet_names$id <- gsub("\\D", "", sheet_names$id) | |
| if(!is.null(keep_sheets)) | |
| { | |
| sheet_names <- sheet_names[sheet_names$name %in% keep_sheets,] | |
| } | |
| entries <- readLines(paste0(tempdir(), "/xl/sharedStrings.xml"), warn = FALSE)[2] | |
| entries <- gsub('^<sst .*">(<si>.*)</sst>$', '\\1', entries) | |
| entries <- strsplit(entries, '</si>')[[1]] | |
| entries <- gsub('^.*<t.*>(.+)</t>$', '\\1', entries) | |
| entries[grep('<t/>', entries)] <- NA | |
| names(entries) <- seq_along(entries) - 1 | |
| # Get column classes | |
| styles <- readLines(paste0(tempdir(), '/xl/styles.xml'), warn = FALSE)[2] | |
| numFmtId <- gsub('^.*<cellXfs count="\\d+">(.*)</cellXfs>.*$', '\\1', styles) | |
| numFmtId <- strsplit(numFmtId, '<xf')[[1]] | |
| numFmtId <- numFmtId[nchar(numFmtId) > 0] | |
| numFmtId <- as.numeric(gsub('.*numFmtId="(\\d+)".*', '\\1', numFmtId)) | |
| cell_style <- as.data.frame(list(id = seq(0, by = 1, along = numFmtId), | |
| numFmtId = numFmtId), stringsAsFactors = FALSE) | |
| # Custom style | |
| numFmt <- gsub('^.*<numFmts count="\\d+">(.*)</numFmts>.*$', '\\1', styles) | |
| if (length(numFmt) > 0) | |
| { | |
| numFmt <- strsplit(numFmt, '/><numFmt')[[1]] | |
| numFmt_cid <- as.numeric(gsub('.*numFmtId="(\\d+)".*', '\\1', numFmt)) | |
| cid_type <- rep(NA, length(numFmt_cid)) | |
| formatCode <- gsub('.*formatCode="(.*)".*', '\\1', numFmt) | |
| pos <- grep('y|m|d', formatCode) | |
| if (length(pos) > 0) | |
| { | |
| date_format <- formatCode[grep('y|m|d', formatCode)] | |
| pos <- grep('h', date_format) | |
| if (length(pos) > 0) | |
| { | |
| date_format <- date_format[-pos] | |
| } | |
| pos <- cell_style$numFmtId %in% numFmt_cid[formatCode %in% date_format] | |
| cell_style$numFmtId[pos] <- 14 | |
| } | |
| } | |
| worksheet_paths <- paste0(tempdir(), "/xl/worksheets/sheet", | |
| sheet_names$id, '.xml') | |
| worksheets <- as.list(NULL) | |
| for (i in seq(along = worksheet_paths)) | |
| { | |
| tryCatch({ | |
| sheet_data <- readLines(worksheet_paths[i], warn = FALSE)[2] | |
| sheet_data <- gsub('(.*<sheetData>)(.*)(</sheetData>.*)', '\\2', sheet_data) | |
| sheet_data <- strsplit(sheet_data, '</row>')[[1]] | |
| sheet_data <- sheet_data[grep('</c>', sheet_data)] | |
| if (length(sheet_data) == 0) | |
| { | |
| next | |
| } | |
| sheet_data <- strsplit(sheet_data, '</c>') | |
| sheet_data <- unlist(sheet_data) | |
| sheet_data <- gsub('(.*<row.*>)(<c.*)', '\\2', sheet_data) | |
| res <- NULL | |
| res$r <- gsub('.*r="(\\w+\\d+)".*', '\\1', sheet_data) | |
| res$v <- rep(NA, length(sheet_data)) | |
| pos <- grep('.*<v>(.*)</v>.*', sheet_data) | |
| res$v[pos] <- gsub('.*<v>(.*)</v>.*', '\\1', sheet_data[pos]) | |
| res$s <- rep(NA, length(sheet_data)) | |
| pos <- grep('.* s="(\\d+|\\w+)"( |>).*', sheet_data) | |
| res$s[pos] <- gsub('.* s="(\\d+|\\w+)"( |>).*', '\\1', sheet_data[pos]) | |
| res$t <- rep(NA, length(sheet_data)) | |
| pos <- grep('.* t="(\\d+|\\w+)"( |>).*', sheet_data) | |
| res$t[pos] <- gsub('.* t="(\\d+|\\w+)"( |>).*', '\\1', sheet_data[pos]) | |
| res <- as.data.frame(res, stringsAsFactors = FALSE) | |
| res$sheet <- sheet_names[sheet_names$id == i, 'name'] | |
| entries_match <- entries[match(res$v, names(entries))] | |
| res$v[res$t == "s" & !is.na(res$t)] <- | |
| entries_match[res$t == "s"& !is.na(res$t)] | |
| res$cols <- match(gsub("\\d", "", res$r), LETTERS) | |
| res$rows <- as.numeric(gsub("\\D", "", res$r)) | |
| nrow <- max(res$rows) | |
| ncol <- max(res$cols) | |
| if (header) | |
| { | |
| nrow <- nrow - 1 | |
| } | |
| res_df <- as.data.frame(matrix(rep(NA, ncol * nrow), ncol = ncol), | |
| stringsAsFactors = FALSE) | |
| style_df <- as.data.frame(matrix(rep(NA, ncol * nrow), ncol = ncol), | |
| stringsAsFactors = FALSE) | |
| if (header) | |
| { | |
| header_df <- res[res$rows == 1,] | |
| header_name <- paste0('V', seq(ncol)) | |
| header_name[header_df$cols] <- header_df$v | |
| names(res_df) <- header_name | |
| res <- res[res$rows != 1,] | |
| res$rows <- res$rows - 1 | |
| } | |
| if (nrow(res) > 0) | |
| { | |
| res_df[as.matrix(res[,c('rows', 'cols')])] <- res$v | |
| s <- as.numeric(res$s) | |
| s <- cell_style$numFmtId[match(s, cell_style$id)] | |
| style_df[as.matrix(res[,c('rows', 'cols')])] <- s | |
| } | |
| style_df <- sapply(style_df, function(x) | |
| { | |
| ifelse(length(unique(x[!is.na(x)])) == 1, unique(x[!is.na(x)]), NA) | |
| }) | |
| style_col <- rep('character', length(style_df)) | |
| style_col[style_df %in% 14:17] <- "date" | |
| style_col[style_df %in% c(18:21, 45:47)] <- "time" | |
| style_col[style_df %in% 22] <- "datetime" | |
| style_col[is.na(style_df) & !sapply(res_df, function(x) any(grepl("\\D", x)))] <- "numeric" | |
| number2date <- function(x) | |
| { | |
| if (class(x) == 'character') | |
| { | |
| pos <- !(is.na(x)) & !(x == 'NA') | |
| } else | |
| { | |
| stop('NOT implemented to check class of column date in xlsxToR') | |
| } | |
| res <- rep(NA, length = length(x)) | |
| res[pos] <- as.numeric(x[pos]) | |
| res <- as.Date(res, origin = os_origin) | |
| res | |
| } | |
| res_df[] <- lapply(seq_along(res_df), function(j) | |
| { | |
| switch(style_col[j], | |
| character = res_df[,j], | |
| numeric = as.numeric(res_df[,j]), | |
| date = number2date(res_df[,j]), | |
| time = strftime(as.POSIXct(as.numeric(res_df[,j]), origin = os_origin), format = "%H:%M:%S"), | |
| datetime = as.POSIXct(as.numeric(res_df[,j]), origin = os_origin)) | |
| }) | |
| if (empty_row) | |
| { | |
| pos <- apply(res_df, 1, function(x) sum(is.na(x))) != ncol(res_df) | |
| if (ncol(res_df) == 1) | |
| { | |
| col_name <- names(res_df) | |
| res_df <- data.frame(res_df[pos,]) | |
| names(res_df) <- col_name | |
| } else | |
| { | |
| res_df <- res_df[pos,] | |
| } | |
| } | |
| sheet_n <- sheet_names$name[sheet_names$id == i] | |
| worksheets[[sheet_n]] <- res_df | |
| }, warning = function(w) | |
| { | |
| print(style_col) | |
| stop(paste0('Warning messsage for sheet ', sheet_names$name[sheet_names$id == i], '\n', w)) | |
| }) | |
| } | |
| if(length(worksheets) == 1) | |
| { | |
| worksheets <- worksheets[[1]] | |
| } | |
| worksheets | |
| } | |
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment