Created
August 12, 2025 14:54
-
-
Save martinctc/b4d7c5f73f5b264ba3d06d13bf2dcf6c to your computer and use it in GitHub Desktop.
[Describe dataset using most common combinations of values]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' @title
#' Analyze Categorical Variable Combinations to Describe Data Populations
#'
#' @description
#' This function analyzes categorical variables in a data frame to identify
#' the most common combinations of values. It generates every combination of
#' the specified categorical variables (from single variables up to all
#' variables combined) and calculates their frequencies and proportions.
#'
#' The function is useful for understanding the composition of your data,
#' identifying the most representative groups, and discovering patterns in
#' categorical variable combinations.
#'
#' @param data A data frame containing the categorical variables to analyze.
#' @param variables Character vector containing the names of categorical
#'   variables to analyze. All specified variables must exist as columns in
#'   the data frame. Repeated names are only analyzed once.
#' @param return Character string specifying the return format. Options:
#'   \itemize{
#'     \item "combined" (default): Returns a single data frame with all
#'       combinations
#'     \item "list": Returns a named list with separate data frames for each
#'       combination size
#'   }
#' @param distinct_rows Logical scalar. If `TRUE` (default), duplicated rows
#'   of the selected variables are dropped before counting, so `n` is the
#'   number of *distinct value profiles* that contain a combination. If
#'   `FALSE`, every row of `data` is counted, so `n` is the number of
#'   observations that share a combination.
#'
#' @return Depending on the `return` parameter:
#'   \itemize{
#'     \item If "combined": A tibble with the columns `combination_size`,
#'       `var_1` ... `var_k` (variable names), `val_1` ... `val_k` (values,
#'       as character), `n` (count) and `pct` (proportion). `var_i`/`val_i`
#'       are `NA` for rows whose combination uses fewer than `i` variables.
#'     \item If "list": A named list where each element is a tibble holding
#'       the combinations of that size (e.g., "1_variable", "2_variables",
#'       etc.)
#'   }
#'
#' @details
#' The function treats NA values as a distinct category. Values are converted
#' to character so that variables of different types (character, factor,
#' integer, logical, ...) can be reported in the same `val_*` columns.
#'
#' By default each combination is counted based on unique rows of the
#' selected variables, and proportions are calculated relative to the number
#' of unique rows. Set `distinct_rows = FALSE` to count every row of `data`
#' and to calculate proportions relative to `nrow(data)`.
#'
#' For each combination size, all combinations of variables (order does not
#' matter) are considered. Within each set of variables, results are sorted
#' by frequency in descending order.
#'
#' @examples
#' # Create simulated employee data
#' set.seed(123)
#' employee_data <- data.frame(
#'   employee_id = 1:1000,
#'   department = sample(c("Engineering", "Sales", "Marketing", "HR"),
#'                       1000, replace = TRUE, prob = c(0.4, 0.3, 0.2, 0.1)),
#'   level = sample(c("Junior", "Senior", "Manager", "Director"),
#'                  1000, replace = TRUE, prob = c(0.5, 0.3, 0.15, 0.05)),
#'   location = sample(c("Seattle", "New York", "Austin", "Remote"),
#'                     1000, replace = TRUE, prob = c(0.35, 0.25, 0.25, 0.15)),
#'   team_size = sample(c("Small", "Medium", "Large"),
#'                      1000, replace = TRUE)
#' )
#'
#' # Analyze department, level, and location combinations
#' results <- describe_categorical_combinations(
#'   data = employee_data,
#'   variables = c("department", "level", "location"),
#'   return = "combined"
#' )
#'
#' # View top combinations
#' head(results, 10)
#'
#' # Count every employee instead of distinct value profiles
#' results_all_rows <- describe_categorical_combinations(
#'   data = employee_data,
#'   variables = c("department", "level", "location"),
#'   distinct_rows = FALSE
#' )
#'
#' # Get results as a list by combination size
#' results_list <- describe_categorical_combinations(
#'   data = employee_data,
#'   variables = c("department", "level"),
#'   return = "list"
#' )
#'
#' # View single variable summaries
#' results_list$`1_variable`
#'
#' # View two-variable combinations
#' head(results_list$`2_variables`)
#'
#' @import dplyr
#' @import purrr
#' @export
describe_categorical_combinations <- function(data,
                                              variables,
                                              return = c("combined", "list"),
                                              distinct_rows = TRUE) {
  # Input validation
  if (!is.data.frame(data)) {
    stop("'data' must be a data frame")
  }
  if (!is.character(variables) || length(variables) == 0) {
    stop("'variables' must be a non-empty character vector")
  }
  missing_vars <- variables[!variables %in% colnames(data)]
  if (length(missing_vars) > 0) {
    stop("The following variables are not found in the data: ",
         paste(missing_vars, collapse = ", "))
  }
  return <- match.arg(return)
  if (!is.logical(distinct_rows) || length(distinct_rows) != 1L ||
      is.na(distinct_rows)) {
    stop("'distinct_rows' must be TRUE or FALSE")
  }

  # A repeated name would otherwise be summarised twice and paired with itself
  variables <- unique(variables)

  # Keep only the analyzed columns; optionally collapse duplicated rows
  analysis_data <- dplyr::select(data, dplyr::all_of(variables))
  if (distinct_rows) {
    analysis_data <- unique(analysis_data)
  }
  total_rows <- nrow(analysis_data)
  if (total_rows == 0) {
    stop("No data rows available for analysis")
  }

  # Count the value combinations of one set of variables. The same code path
  # serves every combination size so all results share one column layout.
  summarise_combo <- function(var_combo) {
    grouped <- dplyr::group_by(
      analysis_data,
      dplyr::across(dplyr::all_of(var_combo))
    )
    # Private count name so an analyzed variable called "n" or "pct" cannot
    # collide with the summary column
    counts <- dplyr::summarise(grouped, .combo_n = dplyr::n(),
                               .groups = "drop")
    # order() is stable, so ties keep the sorted-by-group-key order
    counts <- counts[order(-counts[[".combo_n"]]), , drop = FALSE]

    n_groups <- nrow(counts)
    positions <- seq_along(var_combo)

    var_cols <- lapply(var_combo, rep, times = n_groups)
    names(var_cols) <- paste0("var_", positions)
    # Character values let variables of different types be stacked together
    val_cols <- lapply(var_combo, function(v) as.character(counts[[v]]))
    names(val_cols) <- paste0("val_", positions)

    dplyr::as_tibble(c(
      list(combination_size = rep(length(var_combo), n_groups)),
      var_cols,
      val_cols,
      list(
        n = counts[[".combo_n"]],
        pct = counts[[".combo_n"]] / total_rows
      )
    ))
  }

  # One result table per combination size (1 variable, 2 variables, ...)
  sizes <- seq_along(variables)
  all_results <- purrr::map(sizes, function(n_vars) {
    combos <- utils::combn(variables, n_vars, simplify = FALSE)
    dplyr::bind_rows(purrr::map(combos, summarise_combo))
  })
  names(all_results) <- paste0(sizes, "_variable",
                               ifelse(sizes == 1L, "", "s"))

  # Return based on specified format
  if (return == "list") {
    return(all_results)
  }

  # Stack all sizes; smaller combinations get NA in the unused var_/val_ slots
  combined_results <- dplyr::bind_rows(all_results)
  column_order <- c("combination_size", paste0("var_", sizes),
                    paste0("val_", sizes), "n", "pct")
  combined_results[, column_order, drop = FALSE]
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment