Last active
July 31, 2025 01:34
-
-
Save quantumdolphin/460179c14f5cbedcbd18ff96eb519892 to your computer and use it in GitHub Desktop.
Rich Column-Wise EDA Report Function (quick_eda_summary)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| quick_eda_summary.py | |
| A handy EDA (exploratory data analysis) summary function for pandas DataFrames. | |
| Features: | |
| - Prints and optionally writes out a rich summary of each column (numerical and categorical). | |
| - Includes min, Q1, median, Q3, max, mean, std, skew, kurtosis, value counts, etc. | |
| - Optionally pretty-prints using tabulate, and can write output to a text file. | |
| How to use: | |
| quick_eda_summary(df, to_file="summary.txt") | |
| Notes: | |
| - Output file is overwritten if it already exists. | |
| - df.describe is called without the `datetime_is_numeric` argument (added in pandas 1.1.0, removed in pandas 2.0), | |
| so the function does not raise a TypeError on pandas versions that lack that argument. | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
def _safe_float(val):
    """Coerce *val* to float, mapping unconvertible values (None, pd.NA, text) to NaN."""
    try:
        return float(val)
    except (TypeError, ValueError):
        return np.nan


def _numeric_summary(col_data, describe_vals):
    """Return the report lines (statistics plus warnings) for a numerical column."""
    min_ = _safe_float(describe_vals.get("min", np.nan))
    q1 = _safe_float(describe_vals.get("25%", np.nan))
    median = _safe_float(describe_vals.get("50%", np.nan))
    q3 = _safe_float(describe_vals.get("75%", np.nan))
    max_ = _safe_float(describe_vals.get("max", np.nan))
    mean = _safe_float(describe_vals.get("mean", np.nan))
    std = _safe_float(describe_vals.get("std", np.nan))
    skew = col_data.skew()
    # NOTE: pandas reports *excess* (Fisher) kurtosis, where a normal
    # distribution scores 0; the "> 3" threshold below is the original heuristic.
    kurt = col_data.kurtosis()

    lines = [
        f" 5-number summary: min={min_:.2f}, Q1={q1:.2f}, median={median:.2f}, Q3={q3:.2f}, max={max_:.2f}",
        f" Mean={mean:.2f}, Std={std:.2f}, Skew={skew:.2f}, Kurtosis={kurt:.2f}",
        f" Range: {min_:.2f} to {max_:.2f}",
    ]
    # Comparisons against NaN are False, so missing statistics never warn.
    if np.abs(mean - median) > 0.3 * std:
        lines.append(" ⚠️ Mean and median differ a lot (skew/outliers suspected)")
    if kurt > 3:
        lines.append(" ⚠️ High kurtosis (heavy tails/outliers)")
    return lines


def _categorical_summary(col_data, describe_vals, n_unique, max_cat_display, tabulate=None):
    """Return the report lines (cardinality and top values) for a categorical-like column.

    *tabulate* is the optional ``tabulate.tabulate`` callable; when it is None
    the top values are listed one per line instead of as a table.
    """
    n_unique_desc = describe_vals.get("unique", None)
    lines = [f" Unique values: {n_unique} (describe: {n_unique_desc})"]
    if n_unique <= 10:
        lines.append(" ✅ Few unique values (good for grouping/one-hot encoding)")
    if n_unique > 100:
        lines.append(" ⚠️ High cardinality (many unique values)")

    value_counts = col_data.value_counts(dropna=True)
    if not value_counts.empty:
        top_vals = value_counts.iloc[:max_cat_display]
        if tabulate is not None:
            table = tabulate(
                list(zip(top_vals.index, top_vals.values)),
                headers=["Value", "Count"],
                tablefmt="plain",
            )
            lines.append(" Top values:\n" + "\n".join(" " + line for line in table.splitlines()))
        else:
            lines.append(" Top values:")
            for val, count in top_vals.items():
                lines.append(f" {val} ({count})")

    top = describe_vals.get("top", None)
    freq = describe_vals.get("freq", None)
    if top is not None and freq is not None:
        lines.append(f" Most frequent (describe): {top} ({freq} times)")
    return lines


def quick_eda_summary(
    df,
    max_cols=30,
    max_cat_display=3,
    to_file=None,
    return_str=False
):
    """Print a column-by-column EDA report for a pandas DataFrame.

    For every reported column the missing-value count is shown. Numerical
    columns additionally get a five-number summary, mean, std, skew and
    kurtosis; categorical, object, string and boolean columns get their
    cardinality and most frequent values. Columns that are entirely missing
    or constant are flagged and get no further statistics. Columns of any
    other dtype (e.g. datetimes) only get the missing-value line.

    Args:
        df: The DataFrame to summarise. May have zero rows or zero columns.
        max_cols: Only the first ``max_cols`` columns are reported.
        max_cat_display: Number of most frequent values listed per
            categorical-like column.
        to_file: Optional path; when given, the report is written there
            (UTF-8, overwriting any existing file).
        return_str: When true, the report text is also returned.

    Returns:
        The report as a single string if ``return_str`` is true, else None.
        The report is always printed to stdout.
    """
    # tabulate is optional; without it top values are listed line by line.
    try:
        from tabulate import tabulate
    except ImportError:
        tabulate = None

    n_rows, n_cols = df.shape
    out = [
        f"DataFrame has {n_rows} rows and {n_cols} columns.",
        "=== Quick EDA Summary ===\n",
    ]

    # describe() is computed once for all columns; it raises ValueError on a
    # frame without columns, so that case is short-circuited.
    desc_all = df.describe(include="all") if n_cols else pd.DataFrame()

    if n_cols > max_cols:
        out.append(f"⚠️ DataFrame has {n_cols} columns. Only showing the first {max_cols} columns.\n")
        columns = df.columns[:max_cols]
    else:
        columns = df.columns

    for col in columns:
        col_data = df[col]
        n_missing = int(col_data.isnull().sum())
        # Guard the zero-row case so the percentage is 0.0 instead of NaN.
        missing_pct = 100 * n_missing / n_rows if n_rows else 0.0
        describe_vals = desc_all[col] if col in desc_all else {}

        # Detect the column kind. Categorical-like dtypes are tested first so
        # a categorical with boolean categories keeps its "Categorical" label.
        dtype = col_data.dtype
        if (
            isinstance(dtype, pd.CategoricalDtype)
            or dtype == object
            or isinstance(dtype, pd.StringDtype)
        ):
            col_type = "Categorical"
        elif pd.api.types.is_bool_dtype(dtype):
            # Booleans count as numeric for pandas, but describe(include="all")
            # summarises them like categoricals (unique/top/freq), so numeric
            # statistics would all come out as NaN.
            col_type = "Boolean"
        elif pd.api.types.is_numeric_dtype(dtype):
            col_type = "Numerical"
        else:
            col_type = str(dtype)

        section = [
            f"--- {col} ({col_type}) ---",
            f" Missing values: {n_missing} ({missing_pct:.1f}%)",
        ]

        # Degenerate columns: report the reason and skip detailed statistics.
        n_unique = None
        if n_missing == n_rows:
            degenerate_msg = " ⚠️ All values are missing."
        else:
            n_unique = col_data.nunique(dropna=True)
            if n_unique == 0:
                degenerate_msg = " ⚠️ No unique (non-missing) values."
            elif n_unique == 1:
                degenerate_msg = " ⚠️ Only one unique value (constant column)."
            else:
                degenerate_msg = None

        if degenerate_msg is not None:
            section.append(degenerate_msg)
        elif col_type == "Numerical":
            section.extend(_numeric_summary(col_data, describe_vals))
        elif col_type in ("Categorical", "Boolean"):
            section.extend(
                _categorical_summary(col_data, describe_vals, n_unique, max_cat_display, tabulate)
            )

        out.extend(section)
        out.append("")  # Blank line between columns

    out.append("=== Done ===")
    report = "\n".join(out)

    print(report)

    if to_file is not None:
        with open(to_file, "w", encoding="utf-8") as f:
            f.write(report)

    if return_str:
        return report
    return None
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| DataFrame has 150 rows and 5 columns. | |
| === Quick EDA Summary === | |
| --- sepal_length (Numerical) --- | |
| Missing values: 0 (0.0%) | |
| 5-number summary: min=4.30, Q1=5.10, median=5.80, Q3=6.40, max=7.90 | |
| Mean=5.84, Std=0.83, Skew=0.31, Kurtosis=-0.55 | |
| Range: 4.30 to 7.90 | |
| --- sepal_width (Numerical) --- | |
| Missing values: 0 (0.0%) | |
| 5-number summary: min=2.00, Q1=2.80, median=3.00, Q3=3.30, max=4.40 | |
| Mean=3.05, Std=0.43, Skew=0.33, Kurtosis=0.29 | |
| Range: 2.00 to 4.40 | |
| --- petal_length (Numerical) --- | |
| Missing values: 0 (0.0%) | |
| 5-number summary: min=1.00, Q1=1.60, median=4.35, Q3=5.10, max=6.90 | |
| Mean=3.76, Std=1.76, Skew=-0.27, Kurtosis=-1.40 | |
| Range: 1.00 to 6.90 | |
| ⚠️ Mean and median differ a lot (skew/outliers suspected) | |
| --- petal_width (Numerical) --- | |
| Missing values: 0 (0.0%) | |
| 5-number summary: min=0.10, Q1=0.30, median=1.30, Q3=1.80, max=2.50 | |
| Mean=1.20, Std=0.76, Skew=-0.10, Kurtosis=-1.34 | |
| Range: 0.10 to 2.50 | |
| --- species (Categorical) --- | |
| Missing values: 0 (0.0%) | |
| Unique values: 3 (describe: 3) | |
| ✅ Few unique values (good for grouping/one-hot encoding) | |
| Top values: | |
| setosa (50) | |
| versicolor (50) | |
| virginica (50) | |
| Most frequent (describe): setosa (50 times) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment