eda_polars_template.ipynb
    {
    "cells": [
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "# Prerequisite"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "import polars as pl\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "from typing import Optional, Dict, Any, Union, List\n",
    "import warnings\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")\n"
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "# Data Exploration & Analysis with Polars\n",
    "\n",
    "Lean and efficient EDA template using Polars for fast data processing"
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "## Function & Useful"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "def read_as_text_to_polars(\n",
    " file_path: Union[str, Path],\n",
    " sheet_name: Optional[Union[str, int]] = 0,\n",
    " encoding: str = \"utf-8\",\n",
    " fallback_encoding: str = \"latin-1\",\n",
    " skip_rows: int = 0,\n",
    " columns_to_use: Optional[List[str]] = None,\n",
    " clean_nulls: bool = True,\n",
    " **kwargs,\n",
    ") -> pl.DataFrame:\n",
    " \"\"\"\n",
    " Read various file formats into Polars DataFrame keeping all data as text/string.\n",
    " Preserves original formatting without type conversion.\n",
    "\n",
    " Supported: csv, tsv, xlsx, xlsm, xls, parquet, json, jsonl, avro, feather\n",
    " \"\"\"\n",
    " file_path = Path(file_path)\n",
    " extension = file_path.suffix.lower()\n",
    "\n",
    " def force_text_types(df: pl.DataFrame) -> pl.DataFrame:\n",
    " \"\"\"Convert all columns to UTF8/String type\"\"\"\n",
    " for col in df.columns:\n",
    " if df[col].dtype != pl.Utf8:\n",
    " df = df.with_columns(pl.col(col).cast(pl.Utf8).alias(col))\n",
    " return df\n",
    "\n",
    " def clean_text_values(df: pl.DataFrame) -> pl.DataFrame:\n",
    " \"\"\"Clean text values while preserving format\"\"\"\n",
    " null_values = [\"nan\", \"NaN\", \"NA\", \"N/A\", \"null\", \"NULL\", \"None\", \"NONE\", \"\"]\n",
    "\n",
    " for col in df.columns:\n",
    " # Trim whitespace first\n",
    " df = df.with_columns(pl.col(col).str.strip_chars().alias(col))\n",
    "\n",
    " # Replace null-like values with actual null\n",
    " df = df.with_columns(\n",
    " pl.when(pl.col(col).is_in(null_values))\n",
    " .then(None)\n",
    " .otherwise(pl.col(col))\n",
    " .alias(col)\n",
    " )\n",
    "\n",
    " return df\n",
    "\n",
    " try:\n",
    " # CSV/TSV\n",
    " if extension in [\".csv\", \".tsv\"]:\n",
    " separator = \"\\t\" if extension == \".tsv\" else \",\"\n",
    " try:\n",
    " # Force all columns to be read as UTF8\n",
    " df = pl.read_csv(\n",
    " file_path,\n",
    " separator=separator,\n",
    " encoding=encoding,\n",
    " skip_rows=skip_rows,\n",
    " dtypes=pl.Utf8, # Force all columns as text\n",
    " infer_schema_length=0, # Disable type inference\n",
    " ignore_errors=True,\n",
    " try_parse_dates=False, # Don't parse dates\n",
    " null_values=[\"\"], # Only treat empty string as null\n",
    " **kwargs,\n",
    " )\n",
    " except UnicodeDecodeError:\n",
    " # Fallback encoding\n",
    " df = pl.read_csv(\n",
    " file_path,\n",
    " separator=separator,\n",
    " encoding=fallback_encoding,\n",
    " skip_rows=skip_rows,\n",
    " dtypes=pl.Utf8,\n",
    " infer_schema_length=0,\n",
    " ignore_errors=True,\n",
    " try_parse_dates=False,\n",
    " null_values=[\"\"],\n",
    " **kwargs,\n",
    " )\n",
    " except Exception as csv_error:\n",
    " # Try pandas fallback\n",
    " pd_df = pd.read_csv(\n",
    " file_path,\n",
    " sep=separator,\n",
    " encoding=encoding,\n",
    " dtype=str,\n",
    " na_filter=False,\n",
    " skiprows=skip_rows,\n",
    " **kwargs,\n",
    " )\n",
    " df = pl.from_pandas(pd_df)\n",
    " df = force_text_types(df)\n",
    "\n",
    " # Excel formats\n",
    " elif extension in [\".xlsx\", \".xlsm\", \".xls\"]:\n",
    " try:\n",
    " # Try Polars native reader first\n",
    " df = pl.read_excel(\n",
    " file_path, sheet_name=sheet_name, infer_schema_length=0, **kwargs\n",
    " )\n",
    " df = force_text_types(df)\n",
    " except Exception:\n",
    " # Fallback to pandas\n",
    " try:\n",
    " pd_df = pd.read_excel(\n",
    " file_path,\n",
    " sheet_name=sheet_name\n",
    " if isinstance(sheet_name, (str, int))\n",
    " else 0,\n",
    " dtype=str, # Force all columns as string\n",
    " na_filter=False, # Don't convert to NaN\n",
    " skiprows=skip_rows,\n",
    " **kwargs,\n",
    " )\n",
    " df = pl.from_pandas(pd_df)\n",
    " df = force_text_types(df)\n",
    " except Exception as excel_error:\n",
    " raise ValueError(f\"Cannot read Excel file: {excel_error}\")\n",
    "\n",
    " # Parquet\n",
    " elif extension == \".parquet\":\n",
    " df = pl.read_parquet(file_path, **kwargs)\n",
    " df = force_text_types(df)\n",
    "\n",
    " # JSON/JSONL\n",
    " elif extension in [\".json\", \".jsonl\"]:\n",
    " if extension == \".jsonl\":\n",
    " df = pl.read_ndjson(file_path, infer_schema_length=0, **kwargs)\n",
    " else:\n",
    " df = pl.read_json(file_path, **kwargs)\n",
    " df = force_text_types(df)\n",
    "\n",
    " # Avro\n",
    " elif extension == \".avro\":\n",
    " df = pl.read_avro(file_path, **kwargs)\n",
    " df = force_text_types(df)\n",
    "\n",
    " # Feather/Arrow\n",
    " elif extension in [\".feather\", \".arrow\"]:\n",
    " df = pl.read_ipc(file_path, **kwargs)\n",
    " df = force_text_types(df)\n",
    "\n",
    " else:\n",
    " raise ValueError(f\"Unsupported file format: {extension}\")\n",
    "\n",
    " # Clean column names (snake_case)\n",
    " clean_columns = {}\n",
    " for col in df.columns:\n",
    " # Handle empty column names\n",
    " if col == \"\" or col is None:\n",
    " new_col = f\"column_{len(clean_columns)}\"\n",
    " else:\n",
    " new_col = (\n",
    " col.strip()\n",
    " .replace(\" \", \"_\")\n",
    " .replace(\"-\", \"_\")\n",
    " .replace(\".\", \"_\")\n",
    " .lower()\n",
    " )\n",
    " # Handle duplicate column names\n",
    " if new_col in clean_columns.values():\n",
    " new_col = f\"{new_col}_{len(clean_columns)}\"\n",
    " clean_columns[col] = new_col\n",
    "\n",
    " df = df.rename(clean_columns)\n",
    "\n",
    " # Optional: Clean null-like text values\n",
    " if clean_nulls:\n",
    " df = clean_text_values(df)\n",
    "\n",
    " # Remove completely empty rows\n",
    " df = df.filter(~pl.all_horizontal(pl.all().is_null()))\n",
    "\n",
    " # Remove completely empty columns\n",
    " non_empty_cols = [col for col in df.columns if not df[col].is_null().all()]\n",
    " if non_empty_cols:\n",
    " df = df.select(non_empty_cols)\n",
    "\n",
    " # Filter columns if specified\n",
    " if columns_to_use:\n",
    " available_cols = [col for col in columns_to_use if col in df.columns]\n",
    " if available_cols:\n",
    " df = df.select(available_cols)\n",
    "\n",
    " return df\n",
    "\n",
    " except Exception as e:\n",
    " raise ValueError(f\"Failed to read file '{file_path}': {str(e)}\")\n",
    "\n",
    "\n",
    "def read_preserve_format(\n",
    " file_path: Union[str, Path],\n",
    " preserve_leading_zeros: bool = True,\n",
    " preserve_decimal_places: bool = True,\n",
    " clean_nulls: bool = False, # Default False to preserve all text\n",
    " **kwargs,\n",
    ") -> pl.DataFrame:\n",
    " \"\"\"\n",
    " Enhanced version that preserves formatting like leading zeros and decimal places.\n",
    " \"\"\"\n",
    " df = read_as_text_to_polars(file_path, clean_nulls=clean_nulls, **kwargs)\n",
    "\n",
    " if not preserve_leading_zeros:\n",
    " # Remove leading zeros only from numeric-looking strings\n",
    " for col in df.columns:\n",
    " df = df.with_columns(\n",
    " pl.when(pl.col(col).str.contains(\"^0+[1-9]\\\\d*$\"))\n",
    " .then(pl.col(col).str.lstrip(\"0\"))\n",
    " .otherwise(pl.col(col))\n",
    " .alias(col)\n",
    " )\n",
    "\n",
    " if not preserve_decimal_places:\n",
    " # Remove trailing zeros after decimal\n",
    " for col in df.columns:\n",
    " df = df.with_columns(\n",
    " pl.when(pl.col(col).str.contains(\"\\\\d\\\\.\\\\d*0+$\"))\n",
    " .then(\n",
    " pl.col(col).str.strip_suffix_chars(\"0\").str.strip_suffix_chars(\".\")\n",
    " )\n",
    " .otherwise(pl.col(col))\n",
    " .alias(col)\n",
    " )\n",
    "\n",
    " return df\n",
    "\n",
    "\n",
    "# Quick helper function for simple reading\n",
    "def quick_read_text(file_path: Union[str, Path]) -> pl.DataFrame:\n",
    " \"\"\"\n",
    " Simplified function to quickly read any file as text without any processing.\n",
    " \"\"\"\n",
    " return read_as_text_to_polars(\n",
    " file_path,\n",
    " clean_nulls=False, # Keep everything as-is\n",
    " )\n",
    "\n",
    "\n",
    "# Setup function\n",
    "def setup_environment():\n",
    " \"\"\"Install required packages\"\"\"\n",
    " import subprocess\n",
    " import sys\n",
    "\n",
    " packages = [\"polars>=0.20.0\", \"pandas>=2.0.0\", \"openpyxl\", \"xlrd\", \"pyarrow\"]\n",
    " for package in packages:\n",
    " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", package])\n",
    "\n",
    "\n",
    "# # Example usage\n",
    "# if __name__ == \"__main__\":\n",
    "# # Simple read - preserves everything\n",
    "# df = quick_read_text(\"data.xlsx\")\n",
    "\n",
    "# # Read with options\n",
    "# df = read_as_text_to_polars(\n",
    "# \"data.xlsx\",\n",
    "# sheet_name=0,\n",
    "# clean_nulls=False # Keep \"NA\", \"null\" as text\n",
    "# )\n",
    "\n",
    "# # Read CSV\n",
    "# df_csv = read_as_text_to_polars(\n",
    "# \"data.csv\",\n",
    "# encoding='utf-8',\n",
    "# clean_nulls=True # Convert \"NA\", \"null\" to None\n",
    "# )\n",
    "\n",
    "# # Preserve special formatting\n",
    "# df_formatted = read_preserve_format(\n",
    "# \"data.xlsx\",\n",
    "# preserve_leading_zeros=True, # \"001\" stays \"001\"\n",
    "# preserve_decimal_places=True, # \"1.00\" stays \"1.00\"\n",
    "# clean_nulls=False\n",
    "# )\n",
    "\n",
    "# print(f\"Shape: {df.shape}\")\n",
    "# print(f\"Columns: {df.columns}\")\n",
    "# print(f\"Dtypes: {df.dtypes}\") # All pl.Utf8\n"
    ]
    },
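    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "Optional smoke test for the helpers above: the cell below writes a tiny throwaway CSV to a temp file and reads it back with `quick_read_text`, so you can confirm that every column arrives as `Utf8` and that leading zeros / fixed decimals survive. The file and column names are made up purely for illustration."
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Minimal, self-contained demo (assumes the helper cell above has been run)\n",
    "import os\n",
    "import tempfile\n",
    "\n",
    "# Throwaway CSV with a leading-zero ID and a fixed-decimal amount (illustrative data only)\n",
    "_tmp = tempfile.NamedTemporaryFile(mode=\"w\", suffix=\".csv\", delete=False)\n",
    "_tmp.write(\"Account ID,Amount,Created At\\n001,1.50,2024-01-31\\n002,2.00,2024-02-29\\n\")\n",
    "_tmp.close()\n",
    "\n",
    "demo_df = quick_read_text(_tmp.name)\n",
    "print(demo_df)\n",
    "print(demo_df.dtypes)  # expected: all Utf8, with \"001\" and \"1.50\" kept as text\n",
    "\n",
    "os.unlink(_tmp.name)\n"
    ]
    },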
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "## 1. Data Loading"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "import polars as pl\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "from typing import Optional, Dict, Any, Union\n",
    "import warnings\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "\n",
    "def read_flexible_to_polars(\n",
    " file_path: Union[str, Path],\n",
    " sheet_name: Optional[Union[str, int]] = 0,\n",
    " encoding: str = \"utf-8\",\n",
    " fallback_encoding: str = \"latin-1\",\n",
    " infer_schema_length: int = 10000,\n",
    " **kwargs,\n",
    ") -> pl.DataFrame:\n",
    " \"\"\"\n",
    " Read various file formats into Polars DataFrame with dynamic datatype handling.\n",
    "\n",
    " Supported: csv, tsv, xlsx, xlsm, xls, parquet, json, jsonl, avro, feather\n",
    " \"\"\"\n",
    " file_path = Path(file_path)\n",
    " extension = file_path.suffix.lower()\n",
    "\n",
    " def safe_type_convert(df: pl.DataFrame) -> pl.DataFrame:\n",
    " for col in df.columns:\n",
    " try:\n",
    " if df[col].dtype == pl.Utf8:\n",
    " # Try numeric conversion\n",
    " try:\n",
    " temp = df[col].str.replace_all(\",\", \"\").str.strip_chars()\n",
    " if temp.str.contains(\"^-?\\\\d+$\").all():\n",
    " df = df.with_columns(temp.cast(pl.Int64).alias(col))\n",
    " elif temp.str.contains(\"^-?\\\\d*\\\\.?\\\\d+$\").all():\n",
    " df = df.with_columns(temp.cast(pl.Float64).alias(col))\n",
    " except:\n",
    " pass\n",
    "\n",
    " # Try date conversion\n",
    " try:\n",
    " if df[col].str.contains(\"\\\\d{4}-\\\\d{2}-\\\\d{2}\").any():\n",
    " df = df.with_columns(\n",
    " pl.col(col)\n",
    " .str.strptime(pl.Date, \"%Y-%m-%d\", strict=False)\n",
    " .alias(col)\n",
    " )\n",
    " except:\n",
    " pass\n",
    "\n",
    " # Try datetime conversion\n",
    " try:\n",
    " if (\n",
    " df[col]\n",
    " .str.contains(\"\\\\d{4}-\\\\d{2}-\\\\d{2}.*\\\\d{2}:\\\\d{2}\")\n",
    " .any()\n",
    " ):\n",
    " df = df.with_columns(\n",
    " pl.col(col)\n",
    " .str.strptime(\n",
    " pl.Datetime, \"%Y-%m-%d %H:%M:%S\", strict=False\n",
    " )\n",
    " .alias(col)\n",
    " )\n",
    " except:\n",
    " pass\n",
    "\n",
    " # Try boolean conversion\n",
    " unique_vals = df[col].unique().drop_nulls()\n",
    " if len(unique_vals) <= 2:\n",
    " lower_vals = unique_vals.str.to_lowercase()\n",
    " if set(lower_vals) <= {\n",
    " \"true\",\n",
    " \"false\",\n",
    " \"yes\",\n",
    " \"no\",\n",
    " \"1\",\n",
    " \"0\",\n",
    " \"y\",\n",
    " \"n\",\n",
    " }:\n",
    " mapping = {\n",
    " \"true\": True,\n",
    " \"false\": False,\n",
    " \"yes\": True,\n",
    " \"no\": False,\n",
    " \"1\": True,\n",
    " \"0\": False,\n",
    " \"y\": True,\n",
    " \"n\": False,\n",
    " }\n",
    " df = df.with_columns(\n",
    " pl.col(col)\n",
    " .str.to_lowercase()\n",
    " .map_elements(\n",
    " lambda x: mapping.get(x, None),\n",
    " return_dtype=pl.Boolean,\n",
    " )\n",
    " .alias(col)\n",
    " )\n",
    " except:\n",
    " continue\n",
    " return df\n",
    "\n",
    " try:\n",
    " # CSV/TSV\n",
    " if extension in [\".csv\", \".tsv\"]:\n",
    " separator = \"\\t\" if extension == \".tsv\" else \",\"\n",
    " try:\n",
    " df = pl.read_csv(\n",
    " file_path,\n",
    " separator=separator,\n",
    " encoding=encoding,\n",
    " infer_schema_length=infer_schema_length,\n",
    " ignore_errors=True,\n",
    " try_parse_dates=True,\n",
    " **kwargs,\n",
    " )\n",
    " except:\n",
    " df = pl.read_csv(\n",
    " file_path,\n",
    " separator=separator,\n",
    " encoding=fallback_encoding,\n",
    " infer_schema_length=None,\n",
    " ignore_errors=True,\n",
    " **kwargs,\n",
    " )\n",
    "\n",
    " # Excel formats\n",
    " elif extension in [\".xlsx\", \".xlsm\", \".xls\"]:\n",
    " try:\n",
    " df = pl.read_excel(\n",
    " file_path,\n",
    " sheet_name=sheet_name,\n",
    " infer_schema_length=infer_schema_length,\n",
    " **kwargs,\n",
    " )\n",
    " except:\n",
    " # Fallback to pandas\n",
    " pd_df = pd.read_excel(\n",
    " file_path,\n",
    " sheet_name=sheet_name if isinstance(sheet_name, (str, int)) else 0,\n",
    " **kwargs,\n",
    " )\n",
    " pd_df = pd_df.astype(str).replace([\"nan\", \"NaN\", \"None\"], None)\n",
    " df = pl.from_pandas(pd_df)\n",
    "\n",
    " # Parquet\n",
    " elif extension == \".parquet\":\n",
    " df = pl.read_parquet(file_path, **kwargs)\n",
    "\n",
    " # JSON/JSONL\n",
    " elif extension in [\".json\", \".jsonl\"]:\n",
    " if extension == \".jsonl\":\n",
    " df = pl.read_ndjson(\n",
    " file_path, infer_schema_length=infer_schema_length, **kwargs\n",
    " )\n",
    " else:\n",
    " df = pl.read_json(file_path, **kwargs)\n",
    "\n",
    " # Avro\n",
    " elif extension == \".avro\":\n",
    " df = pl.read_avro(file_path, **kwargs)\n",
    "\n",
    " # Feather/Arrow\n",
    " elif extension in [\".feather\", \".arrow\"]:\n",
    " df = pl.read_ipc(file_path, **kwargs)\n",
    "\n",
    " else:\n",
    " raise ValueError(f\"Unsupported file format: {extension}\")\n",
    "\n",
    " # Clean column names\n",
    " df = df.rename(\n",
    " {col: col.strip().replace(\" \", \"_\").lower() for col in df.columns}\n",
    " )\n",
    "\n",
    " # Apply smart type conversion\n",
    " df = safe_type_convert(df)\n",
    "\n",
    " # Remove completely empty rows and columns\n",
    " df = df.filter(~pl.all_horizontal(pl.all().is_null()))\n",
    " df = df.select([col for col in df.columns if not df[col].is_null().all()])\n",
    "\n",
    " return df\n",
    "\n",
    " except Exception as e:\n",
    " # Ultimate fallback: read as string and convert\n",
    " try:\n",
    " if extension in [\".csv\", \".tsv\"]:\n",
    " df = pl.read_csv(\n",
    " file_path,\n",
    " separator=\"\\t\" if extension == \".tsv\" else \",\",\n",
    " encoding=\"utf-8\",\n",
    " infer_schema_length=None,\n",
    " dtypes={col: pl.Utf8 for col in range(100)},\n",
    " ignore_errors=True,\n",
    " has_header=True,\n",
    " )\n",
    " return safe_type_convert(df)\n",
    " except:\n",
    " pass\n",
    " raise ValueError(f\"Failed to read file: {e}\")\n",
    "\n",
    "\n",
    "# Usage setup\n",
    "def setup_environment():\n",
    " \"\"\"Install required packages\"\"\"\n",
    " import subprocess\n",
    " import sys\n",
    "\n",
    " packages = [\"polars\", \"pandas\", \"openpyxl\", \"xlrd\", \"pyarrow\", \"fastparquet\"]\n",
    " for package in packages:\n",
    " subprocess.check_call(\n",
    " [sys.executable, \"-m\", \"uv pip\", \"install\", \"-q\", package]\n",
    " )\n",
    "\n",
    "\n",
    "# # Load data using the robust function\n",
    "# file_path = \"input/202508_new_checking_with_final_output_analysis_with_ruleV2_asof_20250915_v3.xlsx\"\n",
    "\n",
    "# # Read any supported file\n",
    "# main_vatwht_df = read_flexible_to_polars(file_path, sheet_name=0)\n",
    "\n",
    "# # With specific encoding for CSV\n",
    "# main_vatwht_df = read_flexible_to_polars(file_path, encoding=\"utf-8\")\n",
    "\n",
    "# # With additional parameters\n",
    "# main_vatwht_df = read_flexible_to_polars(\n",
    "# file_path, sheet_name=\"Sheet1\", infer_schema_length=50000\n",
    "# )\n"
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "## 2. Data Overview"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Basic info\n",
    "print(\"=== Dataset Information ===\")\n",
    "print(f\"Shape: {df.shape}\")\n",
    "print(f\"Columns: {df.columns}\")\n",
    "print(\"\\n=== Data Types ===\")\n",
    "print(df.dtypes)\n"
    ]
    },
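    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "A quick visual preview (assumes `df` has been loaded in the Data Loading section): `glimpse()` prints one line per column, `head()` shows the first rows."
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Column-wise glimpse (dtype + first values per column), then the first rows\n",
    "df.glimpse()\n",
    "df.head(10)\n"
    ]
    },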
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Schema overview\n",
    "df.schema\n"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Statistical summary\n",
    "df.describe()\n"
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "## 3. Data Quality Assessment"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Check for missing values\n",
    "missing_info = df.null_count()\n",
    "missing_pct = (missing_info / len(df) * 100).round(2)\n",
    "\n",
    "quality_summary = pl.DataFrame(\n",
    " {\n",
    " \"column\": df.columns,\n",
    " \"null_count\": missing_info.row(0),\n",
    " \"null_percentage\": missing_pct.row(0),\n",
    " }\n",
    ")\n",
    "\n",
    "print(\"=== Missing Values Summary ===\")\n",
    "quality_summary.filter(pl.col(\"null_count\") > 0)\n"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Check for duplicates\n",
    "duplicate_count = df.height - df.unique().height\n",
    "print(f\"Duplicate rows: {duplicate_count}\")\n",
    "print(f\"Duplicate percentage: {(duplicate_count / df.height * 100):.2f}%\")\n"
    ]
    },
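    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "If duplicates exist, it often helps to look at them directly. This optional cell assumes `df` is loaded and uses `DataFrame.is_duplicated()` to pull out every row that occurs more than once."
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Show the duplicated rows themselves (every occurrence, not just the extra copies)\n",
    "duplicated_rows = df.filter(df.is_duplicated())\n",
    "duplicated_rows.head(20)\n"
    ]
    },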
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "## 4. Exploratory Data Analysis"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Select numeric columns for analysis\n",
    "numeric_cols = df.select(pl.col(pl.NUMERIC_DTYPES)).columns\n",
    "categorical_cols = df.select(pl.col(pl.STRING_DTYPES)).columns\n",
    "\n",
    "print(f\"Numeric columns ({len(numeric_cols)}): {numeric_cols}\")\n",
    "print(f\"Categorical columns ({len(categorical_cols)}): {categorical_cols}\")\n"
    ]
    },
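    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "Note: `pl.NUMERIC_DTYPES` / `pl.STRING_DTYPES` are deprecated in recent Polars releases. A hedged alternative using column selectors (assumes a Polars version that ships `polars.selectors`):"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Same numeric/categorical split via column selectors (newer Polars versions)\n",
    "import polars.selectors as cs\n",
    "\n",
    "numeric_cols = df.select(cs.numeric()).columns\n",
    "categorical_cols = df.select(cs.string()).columns\n",
    "\n",
    "print(f\"Numeric columns ({len(numeric_cols)}): {numeric_cols}\")\n",
    "print(f\"Categorical columns ({len(categorical_cols)}): {categorical_cols}\")\n"
    ]
    },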
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Distribution of numeric variables\n",
    "if numeric_cols:\n",
    " n_cols = min(3, len(numeric_cols))\n",
    " n_rows = (len(numeric_cols) + n_cols - 1) // n_cols\n",
    "\n",
    " fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows))\n",
    " axes = axes.flatten() if len(numeric_cols) > 1 else [axes]\n",
    "\n",
    " for i, col in enumerate(numeric_cols[: len(axes)]):\n",
    " data = df[col].to_numpy()\n",
    " data = data[~np.isnan(data)] # Remove NaN values\n",
    " axes[i].hist(data, bins=30, alpha=0.7, edgecolor=\"black\")\n",
    " axes[i].set_title(f\"Distribution of {col}\")\n",
    " axes[i].set_xlabel(col)\n",
    " axes[i].set_ylabel(\"Frequency\")\n",
    "\n",
    " # Hide empty subplots\n",
    " for i in range(len(numeric_cols), len(axes)):\n",
    " axes[i].set_visible(False)\n",
    "\n",
    " plt.tight_layout()\n",
    " plt.show()\n"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Value counts for categorical variables (top categories)\n",
    "if categorical_cols:\n",
    " print(\"=== Top Categories ===\")\n",
    " for col in categorical_cols[:5]: # Show first 5 categorical columns\n",
    " print(f\"\\n{col}:\")\n",
    " top_values = df[col].value_counts().head(10)\n",
    " print(top_values)\n"
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "## 5. Correlation Analysis"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Correlation matrix for numeric variables\n",
    "if len(numeric_cols) > 1:\n",
    " # Convert to pandas for correlation (Polars correlation support is limited)\n",
    " numeric_df = df.select(numeric_cols).to_pandas()\n",
    " correlation_matrix = numeric_df.corr()\n",
    "\n",
    " plt.figure(figsize=(10, 8))\n",
    " sns.heatmap(\n",
    " correlation_matrix,\n",
    " annot=True,\n",
    " cmap=\"coolwarm\",\n",
    " center=0,\n",
    " square=True,\n",
    " linewidths=0.5,\n",
    " )\n",
    " plt.title(\"Correlation Matrix\")\n",
    " plt.tight_layout()\n",
    " plt.show()\n",
    "else:\n",
    " print(\"Not enough numeric columns for correlation analysis\")\n"
    ]
    },
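    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "Pairwise Pearson correlations can also be computed natively in Polars with the `pl.corr` expression, skipping the pandas round-trip. A small sketch (numbers only, no heatmap):"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Pairwise correlations computed directly in Polars via pl.corr\n",
    "if len(numeric_cols) > 1:\n",
    "    pairs = [\n",
    "        pl.corr(a, b).alias(f\"{a} ~ {b}\")\n",
    "        for i, a in enumerate(numeric_cols)\n",
    "        for b in numeric_cols[i + 1 :]\n",
    "    ]\n",
    "    print(df.select(pairs))\n",
    "else:\n",
    "    print(\"Not enough numeric columns for correlation analysis\")\n"
    ]
    },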
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "## 6. Data Filtering & Transformation Examples"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Example transformations with Polars\n",
    "# Uncomment and modify as needed\n",
    "\n",
    "# # Filter data\n",
    "# filtered_df = df.filter(pl.col(\"your_column\") > 100)\n",
    "\n",
    "# # Group by and aggregate\n",
    "# grouped = df.group_by(\"category_column\").agg([\n",
    "# pl.col(\"numeric_column\").mean().alias(\"avg_value\"),\n",
    "# pl.col(\"numeric_column\").count().alias(\"count\")\n",
    "# ])\n",
    "\n",
    "# # Create new columns\n",
    "# df_with_new_col = df.with_columns([\n",
    "# (pl.col(\"col1\") / pl.col(\"col2\")).alias(\"ratio\"),\n",
    "# pl.col(\"date_column\").dt.year().alias(\"year\")\n",
    "# ])\n",
    "\n",
    "print(\"Transform your data using Polars expressions above\")\n"
    ]
    },
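    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "The snippets above use placeholder column names, so they are left commented out. The next cell runs the same patterns on a tiny made-up DataFrame (the `category` / `value` columns are purely illustrative), so it executes without any external data."
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Filter, group/aggregate, and derive a column on a small demo frame\n",
    "demo = pl.DataFrame(\n",
    "    {\n",
    "        \"category\": [\"a\", \"a\", \"b\", \"b\", \"b\"],\n",
    "        \"value\": [10, 20, 5, 15, 25],\n",
    "    }\n",
    ")\n",
    "\n",
    "filtered = demo.filter(pl.col(\"value\") > 10)\n",
    "grouped = demo.group_by(\"category\").agg(\n",
    "    [\n",
    "        pl.col(\"value\").mean().alias(\"avg_value\"),\n",
    "        pl.col(\"value\").count().alias(\"count\"),\n",
    "    ]\n",
    ")\n",
    "with_share = demo.with_columns((pl.col(\"value\") / pl.col(\"value\").sum()).alias(\"share\"))\n",
    "\n",
    "print(filtered)\n",
    "print(grouped)\n",
    "print(with_share)\n"
    ]
    },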
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "## 7. Export Results"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Save processed data\n",
    "# df.write_parquet(\"processed_data.parquet\")\n",
    "# df.write_csv(\"processed_data.csv\")\n",
    "\n",
    "print(\"Analysis complete!\")\n",
    "print(f\"Final dataset shape: {df.shape}\")\n"
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "## Quick Reference: Common Polars Operations\n",
    "\n",
    "```python\n",
    "# Loading data\n",
    "df = pl.read_csv(\"file.csv\")\n",
    "df = pl.read_parquet(\"file.parquet\")\n",
    "df = pl.read_excel(\"file.xlsx\")\n",
    "\n",
    "# Basic operations\n",
    "df.head(n) # First n rows\n",
    "df.tail(n) # Last n rows\n",
    "df.shape # Dimensions\n",
    "df.columns # Column names\n",
    "df.dtypes # Data types\n",
    "df.describe() # Statistical summary\n",
    "\n",
    "# Filtering and selection\n",
    "df.filter(pl.col(\"col\") > 10)\n",
    "df.select([\"col1\", \"col2\"])\n",
    "df.drop([\"col1\", \"col2\"])\n",
    "\n",
    "# Grouping and aggregation\n",
    "df.group_by(\"col\").agg(pl.col(\"val\").sum())\n",
    "df.group_by([\"col1\", \"col2\"]).count()\n",
    "\n",
    "# Transformations\n",
    "df.with_columns(pl.col(\"col\").alias(\"new_name\"))\n",
    "df.sort(\"col\", descending=True)\n",
    "df.unique()\n",
    "```"
    ]
    }
    ],
    "metadata": {
    "kernelspec": {
    "display_name": "Python 3",
    "language": "python",
    "name": "python3"
    },
    "language_info": {
    "codemirror_mode": {
    "name": "ipython",
    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.8.0"
    }
    },
    "nbformat": 4,
    "nbformat_minor": 4
    }