Skip to content

Instantly share code, notes, and snippets.

@martinctc
Created August 12, 2025 14:54
Show Gist options
  • Save martinctc/b4d7c5f73f5b264ba3d06d13bf2dcf6c to your computer and use it in GitHub Desktop.
Save martinctc/b4d7c5f73f5b264ba3d06d13bf2dcf6c to your computer and use it in GitHub Desktop.

Revisions

  1. martinctc created this gist Aug 12, 2025.
    189 changes: 189 additions & 0 deletions describe_categorical_combinations.R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,189 @@
    #' @title
    #' Analyze Categorical Variable Combinations to Describe Data Populations
    #'
    #' @description
    #' This function analyzes categorical variables in a data frame to identify
    #' the most common combinations of values. It generates every combination
    #' of the specified categorical variables (from single variables up to all
    #' variables combined) and calculates the frequency and proportion of each
    #' observed combination of values.
    #'
    #' The function is useful for understanding the composition of your data,
    #' identifying the most representative groups, and discovering patterns in
    #' categorical variable combinations.
    #'
    #' @param data A data frame containing the categorical variables to analyze.
    #' @param variables Character vector containing the names of categorical
    #'   variables to analyze. All specified variables must exist as columns in
    #'   the data frame. Repeated names are only analyzed once.
    #' @param return Character string specifying the return format. Options:
    #' \itemize{
    #'   \item "combined" (default): Returns a single data frame with all
    #'     combinations
    #'   \item "list": Returns a named list with separate data frames for each
    #'     combination size
    #' }
    #'
    #' @return Depending on the `return` parameter:
    #' \itemize{
    #'   \item If "combined": A data frame with the columns `combination_size`,
    #'     `var_1`, ..., `var_k` (variable names), `val_1`, ..., `val_k`
    #'     (values, as character), `n` (row count) and `pct` (proportion of
    #'     rows). `var_i` / `val_i` are `NA` for rows whose combination uses
    #'     fewer than `i` variables.
    #'   \item If "list": A named list where each element contains combinations
    #'     of that size (e.g., "1_variable", "2_variables", etc.), with the same
    #'     column layout restricted to that size.
    #' }
    #'
    #' @details
    #' The function treats NA values as a distinct category. Every row of `data`
    #' contributes to the counts (rows are not de-duplicated), and proportions
    #' are calculated relative to the total number of rows in `data`.
    #'
    #' For each combination size, all combinations of variables (order does not
    #' matter) are considered. Within each variable combination, results are
    #' sorted by frequency in descending order.
    #'
    #' Values are converted to character in the `val_*` columns so that
    #' variables of different types (character, factor, integer, logical, ...)
    #' can be stacked in the same output column.
    #'
    #' @examples
    #' # Create simulated employee data
    #' set.seed(123)
    #' employee_data <- data.frame(
    #'   employee_id = 1:1000,
    #'   department = sample(c("Engineering", "Sales", "Marketing", "HR"),
    #'                       1000, replace = TRUE, prob = c(0.4, 0.3, 0.2, 0.1)),
    #'   level = sample(c("Junior", "Senior", "Manager", "Director"),
    #'                  1000, replace = TRUE, prob = c(0.5, 0.3, 0.15, 0.05)),
    #'   location = sample(c("Seattle", "New York", "Austin", "Remote"),
    #'                     1000, replace = TRUE, prob = c(0.35, 0.25, 0.25, 0.15)),
    #'   team_size = sample(c("Small", "Medium", "Large"),
    #'                      1000, replace = TRUE)
    #' )
    #'
    #' # Analyze department, level, and location combinations
    #' results <- describe_categorical_combinations(
    #'   data = employee_data,
    #'   variables = c("department", "level", "location"),
    #'   return = "combined"
    #' )
    #'
    #' # View top combinations
    #' head(results, 10)
    #'
    #' # Get results as a list by combination size
    #' results_list <- describe_categorical_combinations(
    #'   data = employee_data,
    #'   variables = c("department", "level"),
    #'   return = "list"
    #' )
    #'
    #' # View single variable summaries
    #' results_list$`1_variable`
    #'
    #' # View two-variable combinations
    #' head(results_list$`2_variables`)
    #'
    #' @import dplyr
    #' @import purrr
    #' @export
    describe_categorical_combinations <- function(data,
                                                  variables,
                                                  return = c("combined", "list")) {

      # Input validation
      if (!is.data.frame(data)) {
        stop("'data' must be a data frame")
      }

      if (!is.character(variables) || length(variables) == 0) {
        stop("'variables' must be a non-empty character vector")
      }

      if (any(!variables %in% colnames(data))) {
        missing_vars <- variables[!variables %in% colnames(data)]
        stop("The following variables are not found in the data: ",
             paste(missing_vars, collapse = ", "))
      }

      # Stored under a different name so the argument does not shadow `return()`
      return_format <- match.arg(return)

      # A repeated name would otherwise produce degenerate pairs like (a, a)
      variables <- unique(variables)

      # Keep only the requested columns. Rows are deliberately NOT de-duplicated:
      # every row must contribute to the frequency of its combination.
      analysis_data <- dplyr::select(
        dplyr::ungroup(data),
        dplyr::all_of(variables)
      )

      total_rows <- nrow(analysis_data)

      if (total_rows == 0) {
        stop("No data rows available for analysis")
      }

      # Count the observed value combinations for one set of variables and
      # reshape them into the long `var_i` / `val_i` layout.
      summarise_combination <- function(var_combo) {
        n_vars <- length(var_combo)

        grouped <- dplyr::group_by(
          analysis_data,
          dplyr::across(dplyr::all_of(var_combo))
        )
        counts <- dplyr::summarise(grouped, n = dplyr::n(), .groups = "drop")
        counts <- dplyr::arrange(counts, dplyr::desc(.data$n))

        counts[["pct"]] <- counts[["n"]] / total_rows
        counts[["combination_size"]] <- rep(n_vars, nrow(counts))

        for (i in seq_len(n_vars)) {
          counts[[paste0("var_", i)]] <- rep(var_combo[i], nrow(counts))
          # Coerce to character so values of differently-typed variables can
          # share the same `val_i` column when results are stacked.
          counts[[paste0("val_", i)]] <- as.character(counts[[var_combo[i]]])
        }

        # Drop the original variable columns and fix the column order
        dplyr::select(
          counts,
          dplyr::all_of(c(
            "combination_size",
            paste0("var_", seq_len(n_vars)),
            paste0("val_", seq_len(n_vars)),
            "n",
            "pct"
          ))
        )
      }

      # All variable combinations of a given size, stacked into one data frame.
      # Size 1 needs no special case: combn() yields the single variables.
      generate_combinations <- function(n_vars) {
        var_combinations <- utils::combn(variables, n_vars, simplify = FALSE)
        dplyr::bind_rows(purrr::map(var_combinations, summarise_combination))
      }

      # Generate all combination sizes
      sizes <- seq_along(variables)
      all_results <- purrr::set_names(
        purrr::map(sizes, generate_combinations),
        paste0(sizes, "_variable", ifelse(sizes == 1, "", "s"))
      )

      # Return based on specified format
      if (return_format == "list") {
        return(all_results)
      }

      # bind_rows() appends the extra var_*/val_* columns of larger combinations
      # after `pct`; restore the same layout used by the per-size data frames.
      combined_results <- dplyr::bind_rows(all_results)
      dplyr::select(
        combined_results,
        dplyr::all_of(c(
          "combination_size",
          paste0("var_", sizes),
          paste0("val_", sizes),
          "n",
          "pct"
        ))
      )
    }