martinctc · August 12, 2025 14:54
diff --git a/describe_categorical_combinations.R b/describe_categorical_combinations.R
 #' @title
 #' Analyze Categorical Variable Combinations to Describe Data Populations
 #'
 #' @description
 #' This function analyzes categorical variables in a data frame to identify
 #' the most common combinations of values. It generates all possible combinations
 #' of the specified categorical variables (from single variables up to all
 #' variables combined) and calculates their frequencies and proportions.
 #'
 #' The function is useful for understanding the composition of your data,
 #' identifying the most representative groups, and discovering patterns in
 #' categorical variable combinations.
 #'
 #' @param data A data frame containing the categorical variables to analyze.
 #'   Every row counts as one observation; if the data holds several rows per
 #'   entity (e.g. one row per person and week), reduce it to one row per
 #'   entity before calling this function.
 #' @param variables Character vector containing the names of categorical variables
 #'   to analyze. All specified variables must exist as columns in the data frame.
 #'   Names listed more than once are used only once.
 #' @param return Character string specifying the return format. Options:
 #'   \itemize{
 #'     \item "combined" (default): Returns a single data frame with all combinations
 #'     \item "list": Returns a named list with separate data frames for each combination size
 #'   }
 #'
 #' @return Depending on the `return` parameter:
 #'   \itemize{
 #'     \item If "combined": A data frame with the columns `combination_size`,
 #'       `var_1`, ..., `var_k` (variable names), `val_1`, ..., `val_k` (the
 #'       matching values, as character), `n` (counts) and `pct` (proportions).
 #'       `var_i` and `val_i` are `NA` in rows whose combination uses fewer
 #'       than `i` variables.
 #'     \item If "list": A named list where each element contains combinations of
 #'       that size (e.g., "1_variable", "2_variables", etc.), using the same
 #'       column layout restricted to that size.
 #'   }
 #'
 #' @details
 #' The function treats NA values as a distinct category. Each combination is
 #' counted over all rows of `data` (rows are not de-duplicated), and
 #' proportions are calculated relative to the total number of rows.
 #'
 #' Values are converted to character so that variables of different types
 #' (character, factor, logical, numeric) can share the `val_*` columns.
 #'
 #' For each combination size, every unordered subset of `variables` of that
 #' size is analyzed. Within each subset, rows are sorted by frequency in
 #' descending order. Note that the number of subsets grows as
 #' `2^length(variables) - 1`.
 #'
 #' @examples
 #' # Create simulated employee data
 #' set.seed(123)
 #' employee_data <- data.frame(
 #'   employee_id = 1:1000,
 #'   department = sample(c("Engineering", "Sales", "Marketing", "HR"),
 #'                      1000, replace = TRUE, prob = c(0.4, 0.3, 0.2, 0.1)),
 #'   level = sample(c("Junior", "Senior", "Manager", "Director"),
 #'                 1000, replace = TRUE, prob = c(0.5, 0.3, 0.15, 0.05)),
 #'   location = sample(c("Seattle", "New York", "Austin", "Remote"),
 #'                    1000, replace = TRUE, prob = c(0.35, 0.25, 0.25, 0.15)),
 #'   team_size = sample(c("Small", "Medium", "Large"),
 #'                     1000, replace = TRUE)
 #' )
 #'
 #' # Analyze department, level, and location combinations
 #' results <- describe_categorical_combinations(
 #'   data = employee_data,
 #'   variables = c("department", "level", "location"),
 #'   return = "combined"
 #' )
 #'
 #' # View top combinations
 #' head(results, 10)
 #'
 #' # Get results as a list by combination size
 #' results_list <- describe_categorical_combinations(
 #'   data = employee_data,
 #'   variables = c("department", "level"),
 #'   return = "list"
 #' )
 #'
 #' # View single variable summaries
 #' results_list$`1_variable`
 #'
 #' # View two-variable combinations
 #' head(results_list$`2_variables`)
 #'
 #' @import dplyr
 #' @import purrr
 #' @export
 describe_categorical_combinations <- function(data,
                                             variables,
                                             return = c("combined", "list")) {

   # Input validation
   if (!is.data.frame(data)) {
     stop("'data' must be a data frame")
   }

   if (!is.character(variables) || length(variables) == 0) {
     stop("'variables' must be a non-empty character vector")
   }

   if (any(!variables %in% colnames(data))) {
     missing_vars <- variables[!variables %in% colnames(data)]
     stop("The following variables are not found in the data: ",
          paste(missing_vars, collapse = ", "))
   }

   return <- match.arg(return)

   # A name listed twice would only produce duplicated output rows.
   variables <- unique(variables)

   # Keep every row: repeated rows are exactly what makes a combination
   # frequent, so the data must not be de-duplicated before counting.
   # ungroup() stops pre-existing groups from dragging extra columns along.
   analysis_data <- data %>%
     dplyr::ungroup() %>%
     dplyr::select(dplyr::all_of(variables))

   total_rows <- nrow(analysis_data)

   if (total_rows == 0) {
     stop("No data rows available for analysis")
   }

   max_vars <- length(variables)
   sizes <- seq_len(max_vars)

   # Count every distinct value combination of one set of variables.
   # The result is assembled from the group keys and group sizes instead of
   # adding columns to the grouped data, so input columns called "n", "pct",
   # "var_1", ... cannot clash with the generated output columns.
   summarise_combination <- function(var_combo) {
     grouped <- dplyr::group_by(
       analysis_data,
       dplyr::across(dplyr::all_of(var_combo))
     )
     keys <- dplyr::group_keys(grouped)
     counts <- dplyr::group_size(grouped)
     slots <- seq_along(var_combo)

     var_cols <- lapply(var_combo, rep, times = length(counts))
     names(var_cols) <- paste0("var_", slots)

     # Character values let differently typed variables share one column
     # once the per-combination tables are row-bound.
     val_cols <- lapply(var_combo, function(v) as.character(keys[[v]]))
     names(val_cols) <- paste0("val_", slots)

     result <- dplyr::as_tibble(c(
       list(combination_size = rep(length(var_combo), length(counts))),
       var_cols,
       val_cols,
       list(n = counts, pct = counts / total_rows)
     ))

     dplyr::arrange(result, dplyr::desc(.data[["n"]]))
   }

   # One table per combination size, covering every subset of that size.
   all_results <- sizes %>%
     purrr::map(function(n_vars) {
       utils::combn(variables, n_vars, simplify = FALSE) %>%
         purrr::map(summarise_combination) %>%
         dplyr::bind_rows()
     }) %>%
     purrr::set_names(paste0(sizes, "_variable", ifelse(sizes == 1, "", "s")))

   # Return based on specified format
   if (return == "list") {
     return(all_results)
   }

   # Row-binding appends the var_/val_ columns of larger sizes after n and
   # pct; restore the documented layout.
   combined_results <- dplyr::bind_rows(all_results)
   column_order <- c(
     "combination_size",
     paste0("var_", sizes),
     paste0("val_", sizes),
     "n",
     "pct"
   )

   combined_results[column_order]
 }
	#' @title
	#' Analyze Categorical Variable Combinations to Describe Data Populations
	#'
	#' @description
	#' This function analyzes categorical variables in a data frame to identify
	#' the most common combinations of values. It generates all possible combinations
	#' of the specified categorical variables (from single variables up to all
	#' variables combined) and calculates their frequencies and proportions.
	#'
	#' The function is useful for understanding the composition of your data,
	#' identifying the most representative groups, and discovering patterns in
	#' categorical variable combinations.
	#'
	#' @param data A data frame containing the categorical variables to analyze.
	#'   Every row counts as one observation; if the data holds several rows per
	#'   entity (e.g. one row per person and week), reduce it to one row per
	#'   entity before calling this function.
	#' @param variables Character vector containing the names of categorical variables
	#'   to analyze. All specified variables must exist as columns in the data frame.
	#'   Names listed more than once are used only once.
	#' @param return Character string specifying the return format. Options:
	#'   \itemize{
	#'     \item "combined" (default): Returns a single data frame with all combinations
	#'     \item "list": Returns a named list with separate data frames for each combination size
	#'   }
	#'
	#' @return Depending on the `return` parameter:
	#'   \itemize{
	#'     \item If "combined": A data frame with the columns `combination_size`,
	#'       `var_1`, ..., `var_k` (variable names), `val_1`, ..., `val_k` (the
	#'       matching values, as character), `n` (counts) and `pct` (proportions).
	#'       `var_i` and `val_i` are `NA` in rows whose combination uses fewer
	#'       than `i` variables.
	#'     \item If "list": A named list where each element contains combinations of
	#'       that size (e.g., "1_variable", "2_variables", etc.), using the same
	#'       column layout restricted to that size.
	#'   }
	#'
	#' @details
	#' The function treats NA values as a distinct category. Each combination is
	#' counted over all rows of `data` (rows are not de-duplicated), and
	#' proportions are calculated relative to the total number of rows.
	#'
	#' Values are converted to character so that variables of different types
	#' (character, factor, logical, numeric) can share the `val_*` columns.
	#'
	#' For each combination size, every unordered subset of `variables` of that
	#' size is analyzed. Within each subset, rows are sorted by frequency in
	#' descending order. Note that the number of subsets grows as
	#' `2^length(variables) - 1`.
	#'
	#' @examples
	#' # Create simulated employee data
	#' set.seed(123)
	#' employee_data <- data.frame(
	#'   employee_id = 1:1000,
	#'   department = sample(c("Engineering", "Sales", "Marketing", "HR"),
	#'                      1000, replace = TRUE, prob = c(0.4, 0.3, 0.2, 0.1)),
	#'   level = sample(c("Junior", "Senior", "Manager", "Director"),
	#'                 1000, replace = TRUE, prob = c(0.5, 0.3, 0.15, 0.05)),
	#'   location = sample(c("Seattle", "New York", "Austin", "Remote"),
	#'                    1000, replace = TRUE, prob = c(0.35, 0.25, 0.25, 0.15)),
	#'   team_size = sample(c("Small", "Medium", "Large"),
	#'                     1000, replace = TRUE)
	#' )
	#'
	#' # Analyze department, level, and location combinations
	#' results <- describe_categorical_combinations(
	#'   data = employee_data,
	#'   variables = c("department", "level", "location"),
	#'   return = "combined"
	#' )
	#'
	#' # View top combinations
	#' head(results, 10)
	#'
	#' # Get results as a list by combination size
	#' results_list <- describe_categorical_combinations(
	#'   data = employee_data,
	#'   variables = c("department", "level"),
	#'   return = "list"
	#' )
	#'
	#' # View single variable summaries
	#' results_list$`1_variable`
	#'
	#' # View two-variable combinations
	#' head(results_list$`2_variables`)
	#'
	#' @import dplyr
	#' @import purrr
	#' @export
	describe_categorical_combinations <- function(data,
	                                            variables,
	                                            return = c("combined", "list")) {

	  # Input validation
	  if (!is.data.frame(data)) {
	    stop("'data' must be a data frame")
	  }

	  if (!is.character(variables) || length(variables) == 0) {
	    stop("'variables' must be a non-empty character vector")
	  }

	  if (any(!variables %in% colnames(data))) {
	    missing_vars <- variables[!variables %in% colnames(data)]
	    stop("The following variables are not found in the data: ",
	         paste(missing_vars, collapse = ", "))
	  }

	  return <- match.arg(return)

	  # A name listed twice would only produce duplicated output rows.
	  variables <- unique(variables)

	  # Keep every row: repeated rows are exactly what makes a combination
	  # frequent, so the data must not be de-duplicated before counting.
	  # ungroup() stops pre-existing groups from dragging extra columns along.
	  analysis_data <- data %>%
	    dplyr::ungroup() %>%
	    dplyr::select(dplyr::all_of(variables))

	  total_rows <- nrow(analysis_data)

	  if (total_rows == 0) {
	    stop("No data rows available for analysis")
	  }

	  max_vars <- length(variables)
	  sizes <- seq_len(max_vars)

	  # Count every distinct value combination of one set of variables.
	  # The result is assembled from the group keys and group sizes instead of
	  # adding columns to the grouped data, so input columns called "n", "pct",
	  # "var_1", ... cannot clash with the generated output columns.
	  summarise_combination <- function(var_combo) {
	    grouped <- dplyr::group_by(
	      analysis_data,
	      dplyr::across(dplyr::all_of(var_combo))
	    )
	    keys <- dplyr::group_keys(grouped)
	    counts <- dplyr::group_size(grouped)
	    slots <- seq_along(var_combo)

	    var_cols <- lapply(var_combo, rep, times = length(counts))
	    names(var_cols) <- paste0("var_", slots)

	    # Character values let differently typed variables share one column
	    # once the per-combination tables are row-bound.
	    val_cols <- lapply(var_combo, function(v) as.character(keys[[v]]))
	    names(val_cols) <- paste0("val_", slots)

	    result <- dplyr::as_tibble(c(
	      list(combination_size = rep(length(var_combo), length(counts))),
	      var_cols,
	      val_cols,
	      list(n = counts, pct = counts / total_rows)
	    ))

	    dplyr::arrange(result, dplyr::desc(.data[["n"]]))
	  }

	  # One table per combination size, covering every subset of that size.
	  all_results <- sizes %>%
	    purrr::map(function(n_vars) {
	      utils::combn(variables, n_vars, simplify = FALSE) %>%
	        purrr::map(summarise_combination) %>%
	        dplyr::bind_rows()
	    }) %>%
	    purrr::set_names(paste0(sizes, "_variable", ifelse(sizes == 1, "", "s")))

	  # Return based on specified format
	  if (return == "list") {
	    return(all_results)
	  }

	  # Row-binding appends the var_/val_ columns of larger sizes after n and
	  # pct; restore the documented layout.
	  combined_results <- dplyr::bind_rows(all_results)
	  column_order <- c(
	    "combination_size",
	    paste0("var_", sizes),
	    paste0("val_", sizes),
	    "n",
	    "pct"
	  )

	  combined_results[column_order]
	}