"""quick_eda_summary.py

A handy EDA (exploratory data analysis) summary helper for pandas DataFrames.

Features:
- Prints (and optionally writes to a text file) a rich per-column summary of
  both numerical and categorical columns.
- Numerical columns: min, Q1, median, Q3, max, mean, std, skew, kurtosis.
- Categorical columns: unique count, top value counts, most frequent value.
- Pretty-prints the top-value table with ``tabulate`` when it is installed.

How to use:
    quick_eda_summary(df, to_file="summary.txt")

Notes:
- The output file is overwritten if it already exists.
"""

import pandas as pd
import numpy as np


def _safe_float(val):
    """Coerce *val* to float, mapping anything unconvertible (incl. NaN-ish
    describe() gaps) to ``np.nan`` so downstream ``:.2f`` formatting works."""
    try:
        return float(val)
    except (TypeError, ValueError):
        return np.nan


def _numeric_lines(col_data, describe_vals):
    """Build the summary lines for a numerical column.

    *describe_vals* is the column's slice of ``df.describe(include='all')``
    (a Series) or an empty dict when describe() was unavailable; both support
    ``.get`` with a default.
    """
    min_ = _safe_float(describe_vals.get("min", np.nan))
    q1 = _safe_float(describe_vals.get("25%", np.nan))
    median = _safe_float(describe_vals.get("50%", np.nan))
    q3 = _safe_float(describe_vals.get("75%", np.nan))
    max_ = _safe_float(describe_vals.get("max", np.nan))
    mean = _safe_float(describe_vals.get("mean", np.nan))
    std = _safe_float(describe_vals.get("std", np.nan))
    skew = col_data.skew()
    kurt = col_data.kurtosis()

    lines = [
        f" 5-number summary: min={min_:.2f}, Q1={q1:.2f}, median={median:.2f}, Q3={q3:.2f}, max={max_:.2f}",
        f" Mean={mean:.2f}, Std={std:.2f}, Skew={skew:.2f}, Kurtosis={kurt:.2f}",
        f" Range: {min_:.2f} to {max_:.2f}",
    ]
    # A NaN std makes the comparison False, so no extra guard is needed here.
    if np.abs(mean - median) > 0.3 * std:
        lines.append(" ⚠️ Mean and median differ a lot (skew/outliers suspected)")
    if kurt > 3:
        lines.append(" ⚠️ High kurtosis (heavy tails/outliers)")
    return lines


def _categorical_lines(col_data, describe_vals, n_unique, max_cat_display, tabulate_fn):
    """Build the summary lines for a categorical/object column.

    *tabulate_fn* is the ``tabulate.tabulate`` callable when that package is
    installed, else ``None`` (plain-text fallback is used).
    """
    n_unique_desc = describe_vals.get("unique", None)
    lines = [f" Unique values: {n_unique} (describe: {n_unique_desc})"]
    if n_unique <= 10:
        lines.append(" ✅ Few unique values (good for grouping/one-hot encoding)")
    if n_unique > 100:
        lines.append(" ⚠️ High cardinality (many unique values)")

    value_counts = col_data.value_counts(dropna=True)
    if not value_counts.empty:
        top_vals = value_counts.iloc[:max_cat_display]
        if tabulate_fn is not None:
            table = tabulate_fn(
                list(zip(top_vals.index, top_vals.values)),
                headers=["Value", "Count"],
                tablefmt="plain",
            )
            lines.append(
                " Top values:\n"
                + "\n".join(" " + line for line in table.splitlines())
            )
        else:
            lines.append(" Top values:")
            for val, count in top_vals.items():
                lines.append(f" {val} ({count})")

    top = describe_vals.get("top", None)
    freq = describe_vals.get("freq", None)
    if top is not None and freq is not None:
        lines.append(f" Most frequent (describe): {top} ({freq} times)")
    return lines


def quick_eda_summary(
    df, max_cols=30, max_cat_display=3, to_file=None, return_str=False
):
    """Print a quick per-column EDA summary of *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarize. Empty frames (no rows and/or no columns) are
        handled gracefully instead of raising.
    max_cols : int, default 30
        Only the first *max_cols* columns are summarized (a warning is
        emitted when columns are skipped).
    max_cat_display : int, default 3
        Number of top categorical values to display.
    to_file : str or None, default None
        If given, the summary is also written to this path (overwritten if
        it already exists).
    return_str : bool, default False
        If True, the full summary text is returned.

    Returns
    -------
    str or None
        The summary text when *return_str* is True, else None.
    """
    # tabulate is optional; fall back to plain formatting when missing.
    try:
        from tabulate import tabulate
    except ImportError:
        tabulate = None

    out = []
    n_rows = df.shape[0]
    out.append(f"DataFrame has {n_rows} rows and {df.shape[1]} columns.")
    out.append("=== Quick EDA Summary ===\n")

    # Compute describe(include='all') ONCE for all columns. describe() raises
    # on a column-less frame and yields only NaNs on a row-less one, so fall
    # back to an empty frame in those cases (per-column lookup then misses
    # and each column gets an empty dict instead).
    if df.shape[1] and n_rows:
        desc_all = df.describe(include="all")
    else:
        desc_all = pd.DataFrame()

    if df.shape[1] > max_cols:
        out.append(
            f"⚠️ DataFrame has {df.shape[1]} columns. "
            f"Only showing the first {max_cols} columns.\n"
        )
        columns = df.columns[:max_cols]
    else:
        columns = df.columns

    for col in columns:
        col_data = df[col]
        n_missing = int(col_data.isnull().sum())
        # Guard against ZeroDivisionError on a zero-row frame.
        missing_pct = 100 * n_missing / n_rows if n_rows else 0.0
        # `col in desc_all` tests desc_all's columns.
        describe_vals = desc_all[col] if col in desc_all else {}

        # Detect type. (is_categorical_dtype is deprecated, hence the
        # isinstance check against pd.CategoricalDtype.)
        if pd.api.types.is_numeric_dtype(col_data):
            col_type = "Numerical"
        elif isinstance(col_data.dtype, pd.CategoricalDtype) or col_data.dtype == object:
            col_type = "Categorical"
        else:
            col_type = str(col_data.dtype)

        section = [f"--- {col} ({col_type}) ---"]
        # Report missing values (ALWAYS shown, even when all are missing).
        section.append(f" Missing values: {n_missing} ({missing_pct:.1f}%)")
        # `n_rows and` prevents a spurious "all missing" on a zero-row frame.
        if n_rows and n_missing == n_rows:
            section.append(" ⚠️ All values are missing.")
            out.extend(section)
            out.append("")
            continue

        n_unique = col_data.nunique(dropna=True)
        if n_unique == 0:
            section.append(" ⚠️ No unique (non-missing) values.")
            out.extend(section)
            out.append("")
            continue
        if n_unique == 1:
            section.append(" ⚠️ Only one unique value (constant column).")
            out.extend(section)
            out.append("")
            continue

        if col_type == "Numerical":
            section.extend(_numeric_lines(col_data, describe_vals))
        elif col_type == "Categorical":
            section.extend(
                _categorical_lines(
                    col_data, describe_vals, n_unique, max_cat_display, tabulate
                )
            )

        out.extend(section)
        out.append("")  # Blank line between columns

    out.append("=== Done ===")
    text = "\n".join(out)

    # Print to stdout
    print(text)

    # Optionally write to file (overwrites any existing file)
    if to_file is not None:
        with open(to_file, "w", encoding="utf-8") as f:
            f.write(text)

    if return_str:
        return text