Skip to content

Instantly share code, notes, and snippets.

@martinctc
Created August 12, 2025 14:54
Show Gist options
  • Save martinctc/b4d7c5f73f5b264ba3d06d13bf2dcf6c to your computer and use it in GitHub Desktop.
Save martinctc/b4d7c5f73f5b264ba3d06d13bf2dcf6c to your computer and use it in GitHub Desktop.
[Describe dataset using most common combinations of values]
#' @title
#' Analyze Categorical Variable Combinations to Describe Data Populations
#'
#' @description
#' Counts how often each value, and each combination of values, occurs across
#' a set of categorical variables. Every subset of `variables` is analysed,
#' from single variables up to all variables taken together, and the
#' frequency and proportion of each observed value combination is reported.
#'
#' This is useful for understanding the composition of a data set,
#' identifying the most representative groups, and discovering patterns in
#' categorical variable combinations.
#'
#' @param data A data frame containing the categorical variables to analyze.
#' @param variables Character vector containing the names of categorical
#'   variables to analyze. All specified variables must exist as columns in
#'   `data`.
#' @param return Character string specifying the return format. Options:
#'   \itemize{
#'     \item "combined" (default): Returns a single data frame with all
#'       combinations
#'     \item "list": Returns a named list with separate data frames for each
#'       combination size
#'   }
#'
#' @return Depending on the `return` parameter:
#'   \itemize{
#'     \item If "combined": A tibble with the columns `combination_size`,
#'       `var_1`, ..., `var_k` (variable names), `val_1`, ..., `val_k`
#'       (the corresponding values), `n` (row count) and `pct` (share of all
#'       rows), where `k` is `length(variables)`. `var_*`/`val_*` columns
#'       beyond a row's combination size are `NA`.
#'     \item If "list": A named list ("1_variable", "2_variables", ...) where
#'       each element is a tibble with the same layout, restricted to the
#'       `var_*`/`val_*` columns needed for that combination size.
#'   }
#'
#' @details
#' Every row of `data` is counted, so `n` is the number of rows that carry a
#' given value combination and `pct` is `n` divided by the total number of
#' rows in `data`. `NA` values are treated as a distinct category.
#'
#' For each combination size, all combinations of variables of that size are
#' considered (order does not matter), and the stacked results for that size
#' are sorted by `n` in descending order.
#'
#' Values from different variables are stacked into the same `val_*` columns
#' with [dplyr::bind_rows()], so the variables should have compatible types
#' (for example, all character).
#'
#' @examples
#' # Create simulated employee data
#' set.seed(123)
#' employee_data <- data.frame(
#'   employee_id = 1:1000,
#'   department = sample(c("Engineering", "Sales", "Marketing", "HR"),
#'                       1000, replace = TRUE, prob = c(0.4, 0.3, 0.2, 0.1)),
#'   level = sample(c("Junior", "Senior", "Manager", "Director"),
#'                  1000, replace = TRUE, prob = c(0.5, 0.3, 0.15, 0.05)),
#'   location = sample(c("Seattle", "New York", "Austin", "Remote"),
#'                     1000, replace = TRUE, prob = c(0.35, 0.25, 0.25, 0.15)),
#'   team_size = sample(c("Small", "Medium", "Large"),
#'                      1000, replace = TRUE)
#' )
#'
#' # Analyze department, level, and location combinations
#' results <- describe_categorical_combinations(
#'   data = employee_data,
#'   variables = c("department", "level", "location"),
#'   return = "combined"
#' )
#'
#' # View top combinations
#' head(results, 10)
#'
#' # Get results as a list by combination size
#' results_list <- describe_categorical_combinations(
#'   data = employee_data,
#'   variables = c("department", "level"),
#'   return = "list"
#' )
#'
#' # View single variable summaries
#' results_list$`1_variable`
#'
#' # View two-variable combinations
#' head(results_list$`2_variables`)
#'
#' @import dplyr
#' @import purrr
#' @export
describe_categorical_combinations <- function(data,
                                              variables,
                                              return = c("combined", "list")) {
  # Input validation
  if (!is.data.frame(data)) {
    stop("'data' must be a data frame")
  }
  if (!is.character(variables) || length(variables) == 0) {
    stop("'variables' must be a non-empty character vector")
  }
  if (any(!variables %in% colnames(data))) {
    missing_vars <- variables[!variables %in% colnames(data)]
    stop("The following variables are not found in the data: ",
         paste(missing_vars, collapse = ", "))
  }
  return <- match.arg(return)

  # Keep every row: repeated rows are exactly what makes a combination
  # "common", so they must not be de-duplicated before counting. Any existing
  # grouping is dropped so it cannot drag extra columns into the selection.
  analysis_data <- dplyr::select(
    dplyr::ungroup(data),
    dplyr::all_of(variables)
  )
  total_rows <- nrow(analysis_data)
  if (total_rows == 0) {
    stop("No data rows available for analysis")
  }

  # Output column names for a combination of `k` variables.
  output_columns <- function(k) {
    idx <- seq_len(k)
    c("combination_size", paste0("var_", idx), paste0("val_", idx), "n", "pct")
  }

  # Count the observed value combinations for one set of variables and
  # reshape them into the var_*/val_* layout.
  summarise_var_set <- function(var_combo) {
    n_vars <- length(var_combo)
    counts <- dplyr::summarise(
      dplyr::group_by(
        analysis_data,
        dplyr::across(dplyr::all_of(var_combo))
      ),
      n = dplyr::n(),
      .groups = "drop"
    )
    counts <- dplyr::mutate(
      counts,
      pct = .data$n / total_rows,
      combination_size = n_vars
    )
    # Record which variable each value belongs to, alongside the value itself.
    for (i in seq_along(var_combo)) {
      counts[[paste0("var_", i)]] <- var_combo[[i]]
      counts[[paste0("val_", i)]] <- counts[[var_combo[[i]]]]
    }
    # Selecting only the output columns also drops the original variables.
    dplyr::select(counts, dplyr::all_of(output_columns(n_vars)))
  }

  # All variable sets of a given size, stacked and sorted by frequency.
  summarise_size <- function(n_vars) {
    var_sets <- utils::combn(variables, n_vars, simplify = FALSE)
    stacked <- dplyr::bind_rows(purrr::map(var_sets, summarise_var_set))
    dplyr::arrange(stacked, dplyr::desc(.data$n))
  }

  # Generate all combination sizes
  sizes <- seq_len(length(variables))
  all_results <- purrr::set_names(
    purrr::map(sizes, summarise_size),
    paste0(sizes, "_variable", ifelse(sizes == 1, "", "s"))
  )

  if (return == "list") {
    return(all_results)
  }

  # Stack every size; re-select so n/pct sit after all var_*/val_* columns.
  dplyr::select(
    dplyr::bind_rows(all_results),
    dplyr::all_of(output_columns(length(variables)))
  )
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment