martinctc · August 12, 2025 14:54 · Aug 12, 2025
diff --git a/describe_categorical_combinations.R b/describe_categorical_combinations.R
@@ -0,0 +1,189 @@
+#' @title
+#' Analyze Categorical Variable Combinations to Describe Data Populations
+#'
+#' @description
+#' This function analyzes categorical variables in a data frame to identify
+#' the most common combinations of values. It generates all possible combinations
+#' of the specified categorical variables (from single variables up to all
+#' variables combined) and calculates their frequencies and proportions.
+#'
+#' The function is useful for understanding the composition of your data,
+#' identifying the most representative groups, and discovering patterns in
+#' categorical variable combinations.
+#'
+#' @param data A data frame containing the categorical variables to analyze.
+#' @param variables Character vector containing the names of categorical variables
+#'   to analyze. All specified variables must exist as columns in the data frame.
+#'   Repeated names are used once.
+#' @param return Character string specifying the return format. Options:
+#'   \itemize{
+#'     \item "combined" (default): Returns a single data frame with all combinations
+#'     \item "list": Returns a named list with separate data frames for each combination size
+#'   }
+#'
+#' @return Depending on the `return` parameter:
+#'   \itemize{
+#'     \item If "combined": A data frame with the columns `combination_size`,
+#'       `var_1` ... `var_k` (variable names), `val_1` ... `val_k` (their values),
+#'       `n` (row count) and `pct` (share of all rows), where k is the number of
+#'       variables; `var_i`/`val_i` are NA in rows of smaller combinations
+#'     \item If "list": A named list where each element contains combinations of
+#'       that size (e.g., "1_variable", "2_variables", etc.)
+#'   }
+#'
+#' @details
+#' The function treats NA values as a distinct category. Every row of the data
+#' frame is counted, including rows that repeat the same values, so `n` is the
+#' number of rows showing a combination and `pct` is that count divided by the
+#' total number of rows.
+#'
+#' For each combination size, every combination (unordered set) of variables is
+#' considered. Within each set of variables, results are sorted by frequency in
+#' descending order.
+#'
+#' Values of different variables are stacked into the shared `val_*` columns, so
+#' the variables should have types that can be combined (e.g. character or
+#' factor).
+#'
+#' @examples
+#' # Create simulated employee data
+#' set.seed(123)
+#' employee_data <- data.frame(
+#'   employee_id = 1:1000,
+#'   department = sample(c("Engineering", "Sales", "Marketing", "HR"),
+#'                      1000, replace = TRUE, prob = c(0.4, 0.3, 0.2, 0.1)),
+#'   level = sample(c("Junior", "Senior", "Manager", "Director"),
+#'                 1000, replace = TRUE, prob = c(0.5, 0.3, 0.15, 0.05)),
+#'   location = sample(c("Seattle", "New York", "Austin", "Remote"),
+#'                    1000, replace = TRUE, prob = c(0.35, 0.25, 0.25, 0.15)),
+#'   team_size = sample(c("Small", "Medium", "Large"),
+#'                     1000, replace = TRUE)
+#' )
+#'
+#' # Analyze department, level, and location combinations
+#' results <- describe_categorical_combinations(
+#'   data = employee_data,
+#'   variables = c("department", "level", "location"),
+#'   return = "combined"
+#' )
+#'
+#' # View top combinations
+#' head(results, 10)
+#'
+#' # Get results as a list by combination size
+#' results_list <- describe_categorical_combinations(
+#'   data = employee_data,
+#'   variables = c("department", "level"),
+#'   return = "list"
+#' )
+#'
+#' # View single variable summaries
+#' results_list$`1_variable`
+#'
+#' # View two-variable combinations
+#' head(results_list$`2_variables`)
+#'
+#' @import dplyr
+#' @import purrr
+#' @export
+describe_categorical_combinations <- function(data,
+                                            variables,
+                                            return = c("combined", "list")) {
+
+  # Input validation
+  if (!is.data.frame(data)) {
+    stop("'data' must be a data frame")
+  }
+
+  if (!is.character(variables) || length(variables) == 0) {
+    stop("'variables' must be a non-empty character vector")
+  }
+
+  # Report every missing column at once rather than only the first.
+  missing_vars <- setdiff(variables, colnames(data))
+  if (length(missing_vars) > 0) {
+    stop("The following variables are not found in the data: ",
+         paste(missing_vars, collapse = ", "))
+  }
+
+  # Resolve the choice under a new name so `return()` is not shadowed.
+  return_format <- match.arg(return)
+
+  # A repeated name would only yield duplicated and self-paired summaries.
+  variables <- unique(variables)
+
+  # Largest combination size: all distinct variables together.
+  max_vars <- length(variables)
+
+  # Keep every row: `n` and `pct` are row frequencies, so repeated rows must
+  # not be collapsed before counting.
+  analysis_data <- data %>%
+    dplyr::select(dplyr::all_of(variables))
+
+  # Proportions are expressed relative to all rows of the input.
+  total_rows <- nrow(analysis_data)
+
+  if (total_rows == 0) {
+    stop("No data rows available for analysis")
+  }
+
+  # Summarise one set of variables: one output row per observed combination
+  # of values, most frequent first.
+  # `var_combo` is a character vector of column names taken from `variables`.
+  summarise_combo <- function(var_combo) {
+    n_vars <- length(var_combo)
+    var_cols <- paste0("var_", seq_len(n_vars))
+    val_cols <- paste0("val_", seq_len(n_vars))
+
+    result <- analysis_data %>%
+      dplyr::count(
+        dplyr::across(dplyr::all_of(var_combo)),
+        name = "n",
+        sort = TRUE
+      ) %>%
+      dplyr::mutate(
+        pct = n / total_rows,
+        combination_size = n_vars
+      )
+
+    # Record which variable each position holds and copy its values into the
+    # generic val_* column, so sets of different variables can be stacked.
+    for (i in seq_len(n_vars)) {
+      result[[var_cols[i]]] <- var_combo[i]
+      result[[val_cols[i]]] <- result[[var_combo[i]]]
+    }
+
+    # Drop the original variable columns and fix the column order.
+    result[c("combination_size", var_cols, val_cols, "n", "pct")]
+  }
+
+  # All variable sets of one size, stacked in the order combn() yields them.
+  # `n_vars` is the number of variables per set (1 up to `max_vars`).
+  generate_combinations <- function(n_vars) {
+    utils::combn(variables, n_vars, simplify = FALSE) %>%
+      purrr::map_dfr(summarise_combo)
+  }
+
+  # Generate all combination sizes, named "1_variable", "2_variables", ...
+  sizes <- seq_len(max_vars)
+  all_results <- sizes %>%
+    purrr::map(generate_combinations) %>%
+    purrr::set_names(paste0(sizes, "_variable", ifelse(sizes == 1, "", "s")))
+
+  # Return based on specified format
+  if (return_format == "list") {
+    return(all_results)
+  }
+
+  # Stack all sizes; columns missing for smaller sizes are filled with NA.
+  # Reorder afterwards because bind_rows() appends newly seen columns last,
+  # which would otherwise place var_2/val_2 etc. after `n` and `pct`.
+  combined_results <- dplyr::bind_rows(all_results)
+  combined_results[c(
+    "combination_size",
+    paste0("var_", sizes),
+    paste0("val_", sizes),
+    "n",
+    "pct"
+  )]
+}