Skip to content

Instantly share code, notes, and snippets.

@quantumdolphin
Last active July 31, 2025 01:34
Show Gist options
  • Save quantumdolphin/460179c14f5cbedcbd18ff96eb519892 to your computer and use it in GitHub Desktop.
Rich Column-Wise EDA Report Function (quick_eda_summary)
"""
quick_eda_summary.py
A handy EDA (exploratory data analysis) summary function for pandas DataFrames.
Features:
- Prints and optionally writes out a rich summary of each column (numerical and categorical).
- Includes min, Q1, median, Q3, max, mean, std, skew, kurtosis, value counts, etc.
- Optionally pretty-prints using tabulate, and can write output to a text file.
How to use:
quick_eda_summary(df, to_file="summary.txt")
Notes:
- Output file is overwritten if it already exists.
- `df.describe(include="all")` is called once and reused for every column; the
deprecated `datetime_is_numeric` argument is intentionally not passed, so the
function works across pandas versions.
"""
import pandas as pd
import numpy as np
def quick_eda_summary(
    df,
    max_cols=30,
    max_cat_display=3,
    to_file=None,
    return_str=False
):
    """Print (and optionally save) a column-wise EDA summary of a DataFrame.

    For each column the report shows the missing-value count, then a
    type-specific summary: numeric columns get a 5-number summary, mean,
    std, skew and kurtosis (with heuristic warnings); categorical columns
    get unique counts and the top value counts.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to summarize.
    max_cols : int, default 30
        Only the first ``max_cols`` columns are summarized.
    max_cat_display : int, default 3
        How many of the most frequent values to list for categorical columns.
    to_file : str or None, default None
        If given, the report is also written to this path (file is overwritten).
    return_str : bool, default False
        If True, return the full report as a single string.

    Returns
    -------
    str or None
        The report text when ``return_str`` is True, otherwise None.
    """
    # tabulate is optional; fall back to plain formatting when unavailable.
    try:
        from tabulate import tabulate
        use_tabulate = True
    except ImportError:
        use_tabulate = False

    def _safe_float(val):
        # Coerce describe() entries (which may be NaN, None, or non-numeric
        # for mixed-type frames) to float; NaN formats cleanly with :.2f.
        try:
            return float(val)
        except Exception:
            return np.nan

    n_rows = len(df)
    out = [
        f"DataFrame has {df.shape[0]} rows and {df.shape[1]} columns.",
        "=== Quick EDA Summary ===\n",
    ]

    # Compute df.describe(include='all') ONCE for all column types.
    # Guard: describe() raises ValueError on a DataFrame with no columns.
    desc_all = df.describe(include="all") if df.shape[1] else pd.DataFrame()

    if df.shape[1] > max_cols:
        out.append(
            f"⚠️ DataFrame has {df.shape[1]} columns. Only showing the first {max_cols} columns.\n"
        )
        columns = df.columns[:max_cols]
    else:
        columns = df.columns

    for col in columns:
        col_data = df[col]
        n_missing = col_data.isnull().sum()
        # BUGFIX: avoid ZeroDivisionError on an empty (0-row) DataFrame.
        missing_pct = 100 * n_missing / n_rows if n_rows else 0.0
        describe_vals = desc_all[col] if col in desc_all else {}

        # Classify the column: numeric, categorical/object, or raw dtype name.
        if pd.api.types.is_numeric_dtype(col_data):
            col_type = "Numerical"
        elif isinstance(col_data.dtype, pd.CategoricalDtype) or col_data.dtype == object:
            col_type = "Categorical"
        else:
            col_type = str(col_data.dtype)

        section = [f"--- {col} ({col_type}) ---"]
        # Always report missing values, even when the column is entirely missing.
        section.append(f" Missing values: {n_missing} ({missing_pct:.1f}%)")
        # `n_rows and` keeps a 0-row frame from being mislabeled "all missing".
        if n_rows and n_missing == n_rows:
            section.append(" ⚠️ All values are missing.")
            out.extend(section)
            out.append("")
            continue

        n_unique = col_data.nunique(dropna=True)
        if n_unique == 0:
            section.append(" ⚠️ No unique (non-missing) values.")
            out.extend(section)
            out.append("")
            continue
        if n_unique == 1:
            section.append(" ⚠️ Only one unique value (constant column).")
            out.extend(section)
            out.append("")
            continue

        if col_type == "Numerical":
            min_ = _safe_float(describe_vals.get("min", np.nan))
            q1 = _safe_float(describe_vals.get("25%", np.nan))
            median = _safe_float(describe_vals.get("50%", np.nan))
            q3 = _safe_float(describe_vals.get("75%", np.nan))
            max_ = _safe_float(describe_vals.get("max", np.nan))
            mean = _safe_float(describe_vals.get("mean", np.nan))
            std = _safe_float(describe_vals.get("std", np.nan))
            skew = col_data.skew()
            kurt = col_data.kurtosis()
            section.extend([
                f" 5-number summary: min={min_:.2f}, Q1={q1:.2f}, median={median:.2f}, Q3={q3:.2f}, max={max_:.2f}",
                f" Mean={mean:.2f}, Std={std:.2f}, Skew={skew:.2f}, Kurtosis={kurt:.2f}",
                f" Range: {min_:.2f} to {max_:.2f}",
            ])
            # Heuristic flags; NaN comparisons are False, so all-NaN stats stay silent.
            if np.abs(mean - median) > 0.3 * std:
                section.append(" ⚠️ Mean and median differ a lot (skew/outliers suspected)")
            if kurt > 3:
                section.append(" ⚠️ High kurtosis (heavy tails/outliers)")
        elif col_type == "Categorical":
            n_unique_desc = describe_vals.get("unique", None)
            section.append(f" Unique values: {n_unique} (describe: {n_unique_desc})")
            if n_unique <= 10:
                section.append(" ✅ Few unique values (good for grouping/one-hot encoding)")
            if n_unique > 100:
                section.append(" ⚠️ High cardinality (many unique values)")
            value_counts = col_data.value_counts(dropna=True)
            top = describe_vals.get("top", None)
            freq = describe_vals.get("freq", None)
            if not value_counts.empty:
                top_vals = value_counts.iloc[:max_cat_display]
                if use_tabulate:
                    table = tabulate(
                        list(zip(top_vals.index, top_vals.values)),
                        headers=["Value", "Count"],
                        tablefmt="plain",
                    )
                    section.append(
                        " Top values:\n"
                        + "\n".join(" " + line for line in table.splitlines())
                    )
                else:
                    section.append(" Top values:")
                    for val, count in top_vals.items():
                        section.append(f" {val} ({count})")
            if top is not None and freq is not None:
                section.append(f" Most frequent (describe): {top} ({freq} times)")

        out.extend(section)
        out.append("")  # Blank line between columns

    out.append("=== Done ===")
    # Join once and reuse for printing, writing, and returning.
    report = "\n".join(out)
    print(report)
    if to_file is not None:
        # Output file is overwritten if it already exists.
        with open(to_file, "w", encoding="utf-8") as f:
            f.write(report)
    if return_str:
        return report
DataFrame has 150 rows and 5 columns.
=== Quick EDA Summary ===
--- sepal_length (Numerical) ---
Missing values: 0 (0.0%)
5-number summary: min=4.30, Q1=5.10, median=5.80, Q3=6.40, max=7.90
Mean=5.84, Std=0.83, Skew=0.31, Kurtosis=-0.55
Range: 4.30 to 7.90
--- sepal_width (Numerical) ---
Missing values: 0 (0.0%)
5-number summary: min=2.00, Q1=2.80, median=3.00, Q3=3.30, max=4.40
Mean=3.05, Std=0.43, Skew=0.33, Kurtosis=0.29
Range: 2.00 to 4.40
--- petal_length (Numerical) ---
Missing values: 0 (0.0%)
5-number summary: min=1.00, Q1=1.60, median=4.35, Q3=5.10, max=6.90
Mean=3.76, Std=1.76, Skew=-0.27, Kurtosis=-1.40
Range: 1.00 to 6.90
⚠️ Mean and median differ a lot (skew/outliers suspected)
--- petal_width (Numerical) ---
Missing values: 0 (0.0%)
5-number summary: min=0.10, Q1=0.30, median=1.30, Q3=1.80, max=2.50
Mean=1.20, Std=0.76, Skew=-0.10, Kurtosis=-1.34
Range: 0.10 to 2.50
--- species (Categorical) ---
Missing values: 0 (0.0%)
Unique values: 3 (describe: 3)
✅ Few unique values (good for grouping/one-hot encoding)
Top values:
setosa (50)
versicolor (50)
virginica (50)
Most frequent (describe): setosa (50 times)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment