"""quick_eda_summary.py

A handy EDA (exploratory data analysis) summary helper for pandas DataFrames.

Features:
- Prints (and optionally writes to a text file) a rich per-column summary of
  both numerical and categorical columns.
- Numerical columns: min, Q1, median, Q3, max, mean, std, skew, kurtosis.
- Categorical columns: unique count, top value counts, most frequent value.
- Pretty-prints the top-value table with ``tabulate`` when it is installed.

How to use:
    quick_eda_summary(df, to_file="summary.txt")

Notes:
- The output file is overwritten if it already exists.
"""

import pandas as pd
import numpy as np


def _safe_float(val):
    """Coerce *val* to float, mapping anything unconvertible (incl. NaN-ish
    describe() gaps) to ``np.nan`` so downstream ``:.2f`` formatting works."""
    try:
        return float(val)
    except (TypeError, ValueError):
        return np.nan


def _numeric_lines(col_data, describe_vals):
    """Build the summary lines for a numerical column.

    *describe_vals* is the column's slice of ``df.describe(include='all')``
    (a Series) or an empty dict when describe() was unavailable; both support
    ``.get`` with a default.
    """
    min_ = _safe_float(describe_vals.get("min", np.nan))
    q1 = _safe_float(describe_vals.get("25%", np.nan))
    median = _safe_float(describe_vals.get("50%", np.nan))
    q3 = _safe_float(describe_vals.get("75%", np.nan))
    max_ = _safe_float(describe_vals.get("max", np.nan))
    mean = _safe_float(describe_vals.get("mean", np.nan))
    std = _safe_float(describe_vals.get("std", np.nan))
    skew = col_data.skew()
    kurt = col_data.kurtosis()

    lines = [
        f" 5-number summary: min={min_:.2f}, Q1={q1:.2f}, median={median:.2f}, Q3={q3:.2f}, max={max_:.2f}",
        f" Mean={mean:.2f}, Std={std:.2f}, Skew={skew:.2f}, Kurtosis={kurt:.2f}",
        f" Range: {min_:.2f} to {max_:.2f}",
    ]
    # A NaN std makes the comparison False, so no extra guard is needed here.
    if np.abs(mean - median) > 0.3 * std:
        lines.append(" ⚠️ Mean and median differ a lot (skew/outliers suspected)")
    if kurt > 3:
        lines.append(" ⚠️ High kurtosis (heavy tails/outliers)")
    return lines


def _categorical_lines(col_data, describe_vals, n_unique, max_cat_display, tabulate_fn):
    """Build the summary lines for a categorical/object column.

    *tabulate_fn* is the ``tabulate.tabulate`` callable when that package is
    installed, else ``None`` (plain-text fallback is used).
    """
    n_unique_desc = describe_vals.get("unique", None)
    lines = [f" Unique values: {n_unique} (describe: {n_unique_desc})"]
    if n_unique <= 10:
        lines.append(" ✅ Few unique values (good for grouping/one-hot encoding)")
    if n_unique > 100:
        lines.append(" ⚠️ High cardinality (many unique values)")

    value_counts = col_data.value_counts(dropna=True)
    if not value_counts.empty:
        top_vals = value_counts.iloc[:max_cat_display]
        if tabulate_fn is not None:
            table = tabulate_fn(
                list(zip(top_vals.index, top_vals.values)),
                headers=["Value", "Count"],
                tablefmt="plain",
            )
            lines.append(
                " Top values:\n"
                + "\n".join(" " + line for line in table.splitlines())
            )
        else:
            lines.append(" Top values:")
            for val, count in top_vals.items():
                lines.append(f" {val} ({count})")

    top = describe_vals.get("top", None)
    freq = describe_vals.get("freq", None)
    if top is not None and freq is not None:
        lines.append(f" Most frequent (describe): {top} ({freq} times)")
    return lines


def quick_eda_summary(
    df, max_cols=30, max_cat_display=3, to_file=None, return_str=False
):
    """Print a quick per-column EDA summary of *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarize. Empty frames (no rows and/or no columns) are
        handled gracefully instead of raising.
    max_cols : int, default 30
        Only the first *max_cols* columns are summarized (a warning is
        emitted when columns are skipped).
    max_cat_display : int, default 3
        Number of top categorical values to display.
    to_file : str or None, default None
        If given, the summary is also written to this path (overwritten if
        it already exists).
    return_str : bool, default False
        If True, the full summary text is returned.

    Returns
    -------
    str or None
        The summary text when *return_str* is True, else None.
    """
    # tabulate is optional; fall back to plain formatting when missing.
    try:
        from tabulate import tabulate
    except ImportError:
        tabulate = None

    out = []
    n_rows = df.shape[0]
    out.append(f"DataFrame has {n_rows} rows and {df.shape[1]} columns.")
    out.append("=== Quick EDA Summary ===\n")

    # Compute describe(include='all') ONCE for all columns. describe() raises
    # on a column-less frame and yields only NaNs on a row-less one, so fall
    # back to an empty frame in those cases (per-column lookup then misses
    # and each column gets an empty dict instead).
    if df.shape[1] and n_rows:
        desc_all = df.describe(include="all")
    else:
        desc_all = pd.DataFrame()

    if df.shape[1] > max_cols:
        out.append(
            f"⚠️ DataFrame has {df.shape[1]} columns. "
            f"Only showing the first {max_cols} columns.\n"
        )
        columns = df.columns[:max_cols]
    else:
        columns = df.columns

    for col in columns:
        col_data = df[col]
        n_missing = int(col_data.isnull().sum())
        # Guard against ZeroDivisionError on a zero-row frame.
        missing_pct = 100 * n_missing / n_rows if n_rows else 0.0
        # `col in desc_all` tests desc_all's columns.
        describe_vals = desc_all[col] if col in desc_all else {}

        # Detect type. (is_categorical_dtype is deprecated, hence the
        # isinstance check against pd.CategoricalDtype.)
        if pd.api.types.is_numeric_dtype(col_data):
            col_type = "Numerical"
        elif isinstance(col_data.dtype, pd.CategoricalDtype) or col_data.dtype == object:
            col_type = "Categorical"
        else:
            col_type = str(col_data.dtype)

        section = [f"--- {col} ({col_type}) ---"]
        # Report missing values (ALWAYS shown, even when all are missing).
        section.append(f" Missing values: {n_missing} ({missing_pct:.1f}%)")
        # `n_rows and` prevents a spurious "all missing" on a zero-row frame.
        if n_rows and n_missing == n_rows:
            section.append(" ⚠️ All values are missing.")
            out.extend(section)
            out.append("")
            continue

        n_unique = col_data.nunique(dropna=True)
        if n_unique == 0:
            section.append(" ⚠️ No unique (non-missing) values.")
            out.extend(section)
            out.append("")
            continue
        if n_unique == 1:
            section.append(" ⚠️ Only one unique value (constant column).")
            out.extend(section)
            out.append("")
            continue

        if col_type == "Numerical":
            section.extend(_numeric_lines(col_data, describe_vals))
        elif col_type == "Categorical":
            section.extend(
                _categorical_lines(
                    col_data, describe_vals, n_unique, max_cat_display, tabulate
                )
            )

        out.extend(section)
        out.append("")  # Blank line between columns

    out.append("=== Done ===")
    text = "\n".join(out)

    # Print to stdout
    print(text)

    # Optionally write to file (overwrites any existing file)
    if to_file is not None:
        with open(to_file, "w", encoding="utf-8") as f:
            f.write(text)

    if return_str:
        return text