Created
August 12, 2025 14:54
-
-
Save martinctc/b4d7c5f73f5b264ba3d06d13bf2dcf6c to your computer and use it in GitHub Desktop.
Revisions
-
martinctc created this gist
Aug 12, 2025 .There are no files selected for viewing
#' @title
#' Analyze Categorical Variable Combinations to Describe Data Populations
#'
#' @description
#' This function analyzes categorical variables in a data frame to identify
#' the most common combinations of values. It generates every combination of
#' the specified categorical variables (from single variables up to all
#' variables combined) and calculates their frequencies and proportions.
#'
#' The function is useful for understanding the composition of your data,
#' identifying the most representative groups, and discovering patterns in
#' categorical variable combinations.
#'
#' @param data A data frame containing the categorical variables to analyze.
#'   Any existing grouping is dropped before counting.
#' @param variables Character vector containing the names of categorical
#'   variables to analyze. All specified variables must exist as columns in
#'   the data frame. Duplicated names are ignored.
#' @param return Character string specifying the return format. Options:
#'   \itemize{
#'     \item "combined" (default): Returns a single data frame with all
#'       combinations
#'     \item "list": Returns a named list with separate data frames for each
#'       combination size
#'   }
#'
#' @return Depending on the `return` parameter:
#'   \itemize{
#'     \item If "combined": A data frame with the columns `combination_size`,
#'       `var_1`, ..., `var_k` (variable names), `val_1`, ..., `val_k`
#'       (values, as character), `n` (row count) and `pct` (share of all
#'       rows). `var_i` / `val_i` are `NA` for rows that describe fewer than
#'       `i` variables.
#'     \item If "list": A named list where each element contains combinations
#'       of that size (e.g., "1_variable", "2_variables", etc.), using the
#'       same column layout restricted to that size.
#'   }
#'
#' @details
#' The function treats NA values as a distinct category. Every row of `data`
#' contributes to the counts (duplicate rows are NOT collapsed), and
#' proportions are calculated relative to the total number of rows.
#'
#' For each combination size, every unordered combination of the variables is
#' considered (see [utils::combn()]). Within each set of variables, rows are
#' sorted by frequency in descending order; the sets themselves appear in the
#' order produced by `combn()`.
#'
#' Values are converted to character so that values coming from columns of
#' different types (e.g. character and integer) can share the same `val_i`
#' column.
#'
#' The number of variable sets grows as `2^k - 1` for `k` variables, so keep
#' `variables` reasonably short.
#'
#' @examples
#' # Create simulated employee data
#' set.seed(123)
#' employee_data <- data.frame(
#'   employee_id = 1:1000,
#'   department = sample(c("Engineering", "Sales", "Marketing", "HR"),
#'                       1000, replace = TRUE, prob = c(0.4, 0.3, 0.2, 0.1)),
#'   level = sample(c("Junior", "Senior", "Manager", "Director"),
#'                  1000, replace = TRUE, prob = c(0.5, 0.3, 0.15, 0.05)),
#'   location = sample(c("Seattle", "New York", "Austin", "Remote"),
#'                     1000, replace = TRUE, prob = c(0.35, 0.25, 0.25, 0.15)),
#'   team_size = sample(c("Small", "Medium", "Large"),
#'                      1000, replace = TRUE)
#' )
#'
#' # Analyze department, level, and location combinations
#' results <- describe_categorical_combinations(
#'   data = employee_data,
#'   variables = c("department", "level", "location"),
#'   return = "combined"
#' )
#'
#' # View top combinations
#' head(results, 10)
#'
#' # Get results as a list by combination size
#' results_list <- describe_categorical_combinations(
#'   data = employee_data,
#'   variables = c("department", "level"),
#'   return = "list"
#' )
#'
#' # View single variable summaries
#' results_list$`1_variable`
#'
#' # View two-variable combinations
#' head(results_list$`2_variables`)
#'
#' @import dplyr
#' @import purrr
#' @export
describe_categorical_combinations <- function(data,
                                              variables,
                                              return = c("combined", "list")) {
  # Input validation
  if (!is.data.frame(data)) {
    stop("'data' must be a data frame")
  }

  if (!is.character(variables) || length(variables) == 0) {
    stop("'variables' must be a non-empty character vector")
  }

  missing_vars <- variables[!variables %in% colnames(data)]
  if (length(missing_vars) > 0) {
    stop(
      "The following variables are not found in the data: ",
      paste(missing_vars, collapse = ", ")
    )
  }

  return <- match.arg(return)

  # A repeated name would only produce meaningless "x with x" combinations.
  variables <- unique(variables)

  # Keep every row: collapsing duplicates here would turn the frequencies
  # into counts of distinct value combinations instead of counts of rows.
  # Grouping is dropped so a grouped input cannot leak extra columns into
  # the counts below.
  analysis_data <- dplyr::select(
    dplyr::ungroup(data),
    dplyr::all_of(variables)
  )

  total_rows <- nrow(analysis_data)
  if (total_rows == 0) {
    stop("No data rows available for analysis")
  }

  # Column layout shared by every combination size
  output_columns <- function(n_vars) {
    idx <- seq_len(n_vars)
    c("combination_size", paste0("var_", idx), paste0("val_", idx), "n", "pct")
  }

  # Count one set of variables and reshape it to the var_i / val_i layout
  summarise_combination <- function(var_combo) {
    counts <- dplyr::count(
      analysis_data,
      dplyr::across(dplyr::all_of(var_combo)),
      name = "n"
    )
    counts <- dplyr::arrange(counts, dplyr::desc(.data$n))

    result <- counts
    result[["pct"]] <- counts[["n"]] / total_rows
    result[["combination_size"]] <- length(var_combo)

    for (i in seq_along(var_combo)) {
      result[[paste0("var_", i)]] <- var_combo[[i]]
      # Character values let columns of different types share one val_i
      # column when the per-combination frames are stacked.
      result[[paste0("val_", i)]] <- as.character(counts[[var_combo[[i]]]])
    }

    dplyr::select(result, dplyr::all_of(output_columns(length(var_combo))))
  }

  # Stack the summaries of every variable set of a given size
  summarise_size <- function(n_vars) {
    var_combinations <- utils::combn(variables, n_vars, simplify = FALSE)
    dplyr::bind_rows(purrr::map(var_combinations, summarise_combination))
  }

  # Generate all combination sizes
  sizes <- seq_along(variables)
  all_results <- purrr::set_names(
    purrr::map(sizes, summarise_size),
    paste0(sizes, "_variable", ifelse(sizes == 1L, "", "s"))
  )

  if (identical(return, "list")) {
    return(all_results)
  }

  # Combine all sizes; smaller combinations get NA in the unused
  # var_i / val_i columns.
  combined_results <- dplyr::bind_rows(all_results)
  dplyr::select(
    combined_results,
    dplyr::all_of(output_columns(length(variables)))
  )
}