quantumdolphin · July 31, 2025 01:34
diff --git a/rich_eda_report.py b/rich_eda_report.py
 """
 quick_eda_summary.py

 A handy EDA (exploratory data analysis) summary function for pandas DataFrames.

 Features:
 - Prints and optionally writes out a rich summary of each column (numerical and categorical).
 - Includes min, Q1, median, Q3, max, mean, std, skew, kurtosis, value counts, etc.
 - Optionally pretty-prints using tabulate, and can write output to a text file.

 How to use:
    quick_eda_summary(df, to_file="summary.txt")

 Notes:
 - Output file is overwritten if it already exists.
 - This function calls df.describe(include="all") without the `datetime_is_numeric` argument.
   That argument is only supported in pandas >=1.1.0 and <2.0; if you add it and see a TypeError, remove it.
 """
 import pandas as pd
 import numpy as np

 def quick_eda_summary(
     df,
     max_cols=30,
     max_cat_display=3,
     to_file=None,
     return_str=False
 ):
     """Print a per-column EDA report for ``df`` and optionally save/return it.

     For every reported column the report shows the missing-value count and
     percentage. Numerical columns additionally get a five-number summary,
     mean, std, skew and kurtosis; categorical columns (object, string,
     categorical and boolean dtypes) get their unique count and most
     frequent values. Columns of any other dtype only get the missing-value
     line. Columns that are entirely missing or constant are flagged and
     not summarised further.

     Args:
         df: The pandas DataFrame to summarise.
         max_cols: Maximum number of columns to report; when the frame has
             more, only the first ``max_cols`` are shown and a warning line
             is added.
         max_cat_display: Number of most frequent values listed for each
             categorical column.
         to_file: Optional path; when given, the report is written there as
             UTF-8, overwriting any existing file.
         return_str: When true, the report text is returned.

     Returns:
         The report as a single string if ``return_str`` is true, else None.
         The report is always printed to stdout.
     """
     # tabulate is optional: fall back to plain "value (count)" lines without it.
     try:
         from tabulate import tabulate
         use_tabulate = True
     except ImportError:
         use_tabulate = False

     def _to_float(val):
         """Coerce a describe() entry to float; unconvertible values become NaN."""
         try:
             return float(val)
         except (TypeError, ValueError, OverflowError):
             return np.nan

     n_rows, n_cols = df.shape
     out = [
         f"DataFrame has {n_rows} rows and {n_cols} columns.",
         "=== Quick EDA Summary ===\n",
     ]

     # describe() is computed once for all columns. It raises ValueError on a
     # frame without columns, so that case gets an empty placeholder instead.
     desc_all = df.describe(include="all") if n_cols else pd.DataFrame()

     if n_cols > max_cols:
         out.append(
             f"⚠️ DataFrame has {n_cols} columns. Only showing the first {max_cols} columns.\n"
         )
         columns = df.columns[:max_cols]
     else:
         columns = df.columns

     for col in columns:
         col_data = df[col]
         dtype = col_data.dtype
         n_missing = col_data.isnull().sum()
         # Guard the percentage against a zero-row frame (would be 0/0).
         missing_pct = 100 * n_missing / n_rows if n_rows else 0.0
         describe_vals = desc_all[col] if col in desc_all else {}

         # Detect type. Booleans are checked first: pandas counts bool as a
         # numeric dtype, but describe() summarises it like a categorical, so
         # the numeric branch would only print NaNs for it.
         if pd.api.types.is_bool_dtype(dtype):
             col_type = "Categorical"
         elif pd.api.types.is_numeric_dtype(dtype):
             col_type = "Numerical"
         elif dtype == object or isinstance(dtype, (pd.CategoricalDtype, pd.StringDtype)):
             col_type = "Categorical"
         else:
             col_type = str(dtype)

         section = [
             f"--- {col} ({col_type}) ---",
             # The missing-value line is always shown, even when nothing is missing.
             f"  Missing values: {n_missing} ({missing_pct:.1f}%)",
         ]

         # Degenerate columns get a single warning and no further statistics.
         early_note = None
         if n_missing == n_rows:
             early_note = "  ⚠️ All values are missing."
         else:
             n_unique = col_data.nunique(dropna=True)
             if n_unique == 0:
                 early_note = "  ⚠️ No unique (non-missing) values."
             elif n_unique == 1:
                 early_note = "  ⚠️ Only one unique value (constant column)."
         if early_note is not None:
             section.append(early_note)
             out.extend(section)
             out.append("")
             continue

         if col_type == "Numerical":
             min_ = _to_float(describe_vals.get("min", np.nan))
             q1 = _to_float(describe_vals.get("25%", np.nan))
             median = _to_float(describe_vals.get("50%", np.nan))
             q3 = _to_float(describe_vals.get("75%", np.nan))
             max_ = _to_float(describe_vals.get("max", np.nan))
             mean = _to_float(describe_vals.get("mean", np.nan))
             std = _to_float(describe_vals.get("std", np.nan))
             skew = col_data.skew()
             kurt = col_data.kurtosis()
             section.extend([
                 f"  5-number summary: min={min_:.2f}, Q1={q1:.2f}, median={median:.2f}, Q3={q3:.2f}, max={max_:.2f}",
                 f"  Mean={mean:.2f}, Std={std:.2f}, Skew={skew:.2f}, Kurtosis={kurt:.2f}",
                 f"  Range: {min_:.2f} to {max_:.2f}",
             ])
             if np.abs(mean - median) > 0.3 * std:
                 section.append("  ⚠️ Mean and median differ a lot (skew/outliers suspected)")
             # NOTE: pandas reports excess (Fisher) kurtosis, where a normal
             # distribution scores 0; the > 3 cut-off is the existing heuristic.
             if kurt > 3:
                 section.append("  ⚠️ High kurtosis (heavy tails/outliers)")

         elif col_type == "Categorical":
             n_unique_desc = describe_vals.get("unique", None)
             section.append(f"  Unique values: {n_unique} (describe: {n_unique_desc})")
             if n_unique <= 10:
                 section.append("  ✅ Few unique values (good for grouping/one-hot encoding)")
             if n_unique > 100:
                 section.append("  ⚠️ High cardinality (many unique values)")
             value_counts = col_data.value_counts(dropna=True)
             top = describe_vals.get("top", None)
             freq = describe_vals.get("freq", None)
             if not value_counts.empty:
                 top_vals = value_counts.iloc[:max_cat_display]
                 if use_tabulate:
                     table = tabulate(
                         list(zip(top_vals.index, top_vals.values)),
                         headers=["Value", "Count"],
                         tablefmt="plain",
                     )
                     section.append("  Top values:\n" + "\n".join("    " + line for line in table.splitlines()))
                 else:
                     section.append("  Top values:")
                     for val, count in top_vals.items():
                         section.append(f"    {val} ({count})")
                 if top is not None and freq is not None:
                     section.append(f"  Most frequent (describe): {top} ({freq} times)")

         out.extend(section)
         out.append("")  # Blank line between columns

     out.append("=== Done ===")

     # Join once and reuse the same text for stdout, the file and the return value.
     report = "\n".join(out)
     print(report)

     if to_file is not None:
         with open(to_file, "w", encoding="utf-8") as f:
             f.write(report)

     if return_str:
         return report
     return None
diff --git a/sample-output.txt b/sample-output.txt
 DataFrame has 150 rows and 5 columns.
 === Quick EDA Summary ===

 --- sepal_length (Numerical) ---
  Missing values: 0 (0.0%)
  5-number summary: min=4.30, Q1=5.10, median=5.80, Q3=6.40, max=7.90
  Mean=5.84, Std=0.83, Skew=0.31, Kurtosis=-0.55
  Range: 4.30 to 7.90

 --- sepal_width (Numerical) ---
  Missing values: 0 (0.0%)
  5-number summary: min=2.00, Q1=2.80, median=3.00, Q3=3.30, max=4.40
  Mean=3.05, Std=0.43, Skew=0.33, Kurtosis=0.29
  Range: 2.00 to 4.40

 --- petal_length (Numerical) ---
  Missing values: 0 (0.0%)
  5-number summary: min=1.00, Q1=1.60, median=4.35, Q3=5.10, max=6.90
  Mean=3.76, Std=1.76, Skew=-0.27, Kurtosis=-1.40
  Range: 1.00 to 6.90
  ⚠️ Mean and median differ a lot (skew/outliers suspected)

 --- petal_width (Numerical) ---
  Missing values: 0 (0.0%)
  5-number summary: min=0.10, Q1=0.30, median=1.30, Q3=1.80, max=2.50
  Mean=1.20, Std=0.76, Skew=-0.10, Kurtosis=-1.34
  Range: 0.10 to 2.50

 --- species (Categorical) ---
  Missing values: 0 (0.0%)
  Unique values: 3 (describe: 3)
  ✅ Few unique values (good for grouping/one-hot encoding)
  Top values:
    setosa (50)
    versicolor (50)
    virginica (50)
  Most frequent (describe): setosa (50 times)
	"""
	quick_eda_summary.py

	A handy EDA (exploratory data analysis) summary function for pandas DataFrames.

	Features:
	- Prints and optionally writes out a rich summary of each column (numerical and categorical).
	- Includes min, Q1, median, Q3, max, mean, std, skew, kurtosis, value counts, etc.
	- Optionally pretty-prints using tabulate, and can write output to a text file.

	How to use:
	quick_eda_summary(df, to_file="summary.txt")

	Notes:
	- Output file is overwritten if it already exists.
	- This function calls df.describe(include="all") without the `datetime_is_numeric` argument.
	  That argument is only supported in pandas >=1.1.0 and <2.0; if you add it and see a TypeError, remove it.
	"""
	import pandas as pd
	import numpy as np

	def quick_eda_summary(
	    df,
	    max_cols=30,
	    max_cat_display=3,
	    to_file=None,
	    return_str=False
	):
	    """Print a per-column EDA report for ``df`` and optionally save/return it.

	    For every reported column the report shows the missing-value count and
	    percentage. Numerical columns additionally get a five-number summary,
	    mean, std, skew and kurtosis; categorical columns (object, string,
	    categorical and boolean dtypes) get their unique count and most
	    frequent values. Columns of any other dtype only get the missing-value
	    line. Columns that are entirely missing or constant are flagged and
	    not summarised further.

	    Args:
	        df: The pandas DataFrame to summarise.
	        max_cols: Maximum number of columns to report; when the frame has
	            more, only the first ``max_cols`` are shown and a warning line
	            is added.
	        max_cat_display: Number of most frequent values listed for each
	            categorical column.
	        to_file: Optional path; when given, the report is written there as
	            UTF-8, overwriting any existing file.
	        return_str: When true, the report text is returned.

	    Returns:
	        The report as a single string if ``return_str`` is true, else None.
	        The report is always printed to stdout.
	    """
	    # tabulate is optional: fall back to plain "value (count)" lines without it.
	    try:
	        from tabulate import tabulate
	        use_tabulate = True
	    except ImportError:
	        use_tabulate = False

	    def _to_float(val):
	        """Coerce a describe() entry to float; unconvertible values become NaN."""
	        try:
	            return float(val)
	        except (TypeError, ValueError, OverflowError):
	            return np.nan

	    n_rows, n_cols = df.shape
	    out = [
	        f"DataFrame has {n_rows} rows and {n_cols} columns.",
	        "=== Quick EDA Summary ===\n",
	    ]

	    # describe() is computed once for all columns. It raises ValueError on a
	    # frame without columns, so that case gets an empty placeholder instead.
	    desc_all = df.describe(include="all") if n_cols else pd.DataFrame()

	    if n_cols > max_cols:
	        out.append(
	            f"⚠️ DataFrame has {n_cols} columns. Only showing the first {max_cols} columns.\n"
	        )
	        columns = df.columns[:max_cols]
	    else:
	        columns = df.columns

	    for col in columns:
	        col_data = df[col]
	        dtype = col_data.dtype
	        n_missing = col_data.isnull().sum()
	        # Guard the percentage against a zero-row frame (would be 0/0).
	        missing_pct = 100 * n_missing / n_rows if n_rows else 0.0
	        describe_vals = desc_all[col] if col in desc_all else {}

	        # Detect type. Booleans are checked first: pandas counts bool as a
	        # numeric dtype, but describe() summarises it like a categorical, so
	        # the numeric branch would only print NaNs for it.
	        if pd.api.types.is_bool_dtype(dtype):
	            col_type = "Categorical"
	        elif pd.api.types.is_numeric_dtype(dtype):
	            col_type = "Numerical"
	        elif dtype == object or isinstance(dtype, (pd.CategoricalDtype, pd.StringDtype)):
	            col_type = "Categorical"
	        else:
	            col_type = str(dtype)

	        section = [
	            f"--- {col} ({col_type}) ---",
	            # The missing-value line is always shown, even when nothing is missing.
	            f" Missing values: {n_missing} ({missing_pct:.1f}%)",
	        ]

	        # Degenerate columns get a single warning and no further statistics.
	        early_note = None
	        if n_missing == n_rows:
	            early_note = " ⚠️ All values are missing."
	        else:
	            n_unique = col_data.nunique(dropna=True)
	            if n_unique == 0:
	                early_note = " ⚠️ No unique (non-missing) values."
	            elif n_unique == 1:
	                early_note = " ⚠️ Only one unique value (constant column)."
	        if early_note is not None:
	            section.append(early_note)
	            out.extend(section)
	            out.append("")
	            continue

	        if col_type == "Numerical":
	            min_ = _to_float(describe_vals.get("min", np.nan))
	            q1 = _to_float(describe_vals.get("25%", np.nan))
	            median = _to_float(describe_vals.get("50%", np.nan))
	            q3 = _to_float(describe_vals.get("75%", np.nan))
	            max_ = _to_float(describe_vals.get("max", np.nan))
	            mean = _to_float(describe_vals.get("mean", np.nan))
	            std = _to_float(describe_vals.get("std", np.nan))
	            skew = col_data.skew()
	            kurt = col_data.kurtosis()
	            section.extend([
	                f" 5-number summary: min={min_:.2f}, Q1={q1:.2f}, median={median:.2f}, Q3={q3:.2f}, max={max_:.2f}",
	                f" Mean={mean:.2f}, Std={std:.2f}, Skew={skew:.2f}, Kurtosis={kurt:.2f}",
	                f" Range: {min_:.2f} to {max_:.2f}",
	            ])
	            if np.abs(mean - median) > 0.3 * std:
	                section.append(" ⚠️ Mean and median differ a lot (skew/outliers suspected)")
	            # NOTE: pandas reports excess (Fisher) kurtosis, where a normal
	            # distribution scores 0; the > 3 cut-off is the existing heuristic.
	            if kurt > 3:
	                section.append(" ⚠️ High kurtosis (heavy tails/outliers)")

	        elif col_type == "Categorical":
	            n_unique_desc = describe_vals.get("unique", None)
	            section.append(f" Unique values: {n_unique} (describe: {n_unique_desc})")
	            if n_unique <= 10:
	                section.append(" ✅ Few unique values (good for grouping/one-hot encoding)")
	            if n_unique > 100:
	                section.append(" ⚠️ High cardinality (many unique values)")
	            value_counts = col_data.value_counts(dropna=True)
	            top = describe_vals.get("top", None)
	            freq = describe_vals.get("freq", None)
	            if not value_counts.empty:
	                top_vals = value_counts.iloc[:max_cat_display]
	                if use_tabulate:
	                    table = tabulate(
	                        list(zip(top_vals.index, top_vals.values)),
	                        headers=["Value", "Count"],
	                        tablefmt="plain",
	                    )
	                    section.append(" Top values:\n" + "\n".join(" " + line for line in table.splitlines()))
	                else:
	                    section.append(" Top values:")
	                    for val, count in top_vals.items():
	                        section.append(f" {val} ({count})")
	                if top is not None and freq is not None:
	                    section.append(f" Most frequent (describe): {top} ({freq} times)")

	        out.extend(section)
	        out.append("")  # Blank line between columns

	    out.append("=== Done ===")

	    # Join once and reuse the same text for stdout, the file and the return value.
	    report = "\n".join(out)
	    print(report)

	    if to_file is not None:
	        with open(to_file, "w", encoding="utf-8") as f:
	            f.write(report)

	    if return_str:
	        return report
	    return None
	DataFrame has 150 rows and 5 columns.
	=== Quick EDA Summary ===

	--- sepal_length (Numerical) ---
	Missing values: 0 (0.0%)
	5-number summary: min=4.30, Q1=5.10, median=5.80, Q3=6.40, max=7.90
	Mean=5.84, Std=0.83, Skew=0.31, Kurtosis=-0.55
	Range: 4.30 to 7.90

	--- sepal_width (Numerical) ---
	Missing values: 0 (0.0%)
	5-number summary: min=2.00, Q1=2.80, median=3.00, Q3=3.30, max=4.40
	Mean=3.05, Std=0.43, Skew=0.33, Kurtosis=0.29
	Range: 2.00 to 4.40

	--- petal_length (Numerical) ---
	Missing values: 0 (0.0%)
	5-number summary: min=1.00, Q1=1.60, median=4.35, Q3=5.10, max=6.90
	Mean=3.76, Std=1.76, Skew=-0.27, Kurtosis=-1.40
	Range: 1.00 to 6.90
	⚠️ Mean and median differ a lot (skew/outliers suspected)

	--- petal_width (Numerical) ---
	Missing values: 0 (0.0%)
	5-number summary: min=0.10, Q1=0.30, median=1.30, Q3=1.80, max=2.50
	Mean=1.20, Std=0.76, Skew=-0.10, Kurtosis=-1.34
	Range: 0.10 to 2.50

	--- species (Categorical) ---
	Missing values: 0 (0.0%)
	Unique values: 3 (describe: 3)
	✅ Few unique values (good for grouping/one-hot encoding)
	Top values:
	setosa (50)
	versicolor (50)
	virginica (50)
	Most frequent (describe): setosa (50 times)