eda_polars_template.ipynb
    {
    "cells": [
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "# Prerequisite"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "import polars as pl\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "from typing import Optional, Dict, Any, Union, List\n",
    "import warnings\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")\n"
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "# Data Exploration & Analysis with Polars\n",
    "\n",
    "Lean and efficient EDA template using Polars for fast data processing"
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "## Function & Useful"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "def read_as_text_to_polars(\n",
    " file_path: Union[str, Path],\n",
    " sheet_name: Optional[Union[str, int]] = 0,\n",
    " encoding: str = \"utf-8\",\n",
    " fallback_encoding: str = \"latin-1\",\n",
    " skip_rows: int = 0,\n",
    " columns_to_use: Optional[List[str]] = None,\n",
    " clean_nulls: bool = True,\n",
    " **kwargs,\n",
    ") -> pl.DataFrame:\n",
    " \"\"\"\n",
    " Read various file formats into Polars DataFrame keeping all data as text/string.\n",
    " Preserves original formatting without type conversion.\n",
    "\n",
    " Supported: csv, tsv, xlsx, xlsm, xls, parquet, json, jsonl, avro, feather\n",
    " \"\"\"\n",
    " file_path = Path(file_path)\n",
    " extension = file_path.suffix.lower()\n",
    "\n",
    " def force_text_types(df: pl.DataFrame) -> pl.DataFrame:\n",
    " \"\"\"Convert all columns to UTF8/String type\"\"\"\n",
    " for col in df.columns:\n",
    " if df[col].dtype != pl.Utf8:\n",
    " df = df.with_columns(pl.col(col).cast(pl.Utf8).alias(col))\n",
    " return df\n",
    "\n",
    " def clean_text_values(df: pl.DataFrame) -> pl.DataFrame:\n",
    " \"\"\"Clean text values while preserving format\"\"\"\n",
    " null_values = [\"nan\", \"NaN\", \"NA\", \"N/A\", \"null\", \"NULL\", \"None\", \"NONE\", \"\"]\n",
    "\n",
    " for col in df.columns:\n",
    " # Trim whitespace first\n",
    " df = df.with_columns(pl.col(col).str.strip_chars().alias(col))\n",
    "\n",
    " # Replace null-like values with actual null\n",
    " df = df.with_columns(\n",
    " pl.when(pl.col(col).is_in(null_values))\n",
    " .then(None)\n",
    " .otherwise(pl.col(col))\n",
    " .alias(col)\n",
    " )\n",
    "\n",
    " return df\n",
    "\n",
    " try:\n",
    " # CSV/TSV\n",
    " if extension in [\".csv\", \".tsv\"]:\n",
    " separator = \"\\t\" if extension == \".tsv\" else \",\"\n",
    " try:\n",
    " # Force all columns to be read as UTF8\n",
    " df = pl.read_csv(\n",
    " file_path,\n",
    " separator=separator,\n",
    " encoding=encoding,\n",
    " skip_rows=skip_rows,\n",
    " dtypes=pl.Utf8, # Force all columns as text\n",
    " infer_schema_length=0, # Disable type inference\n",
    " ignore_errors=True,\n",
    " try_parse_dates=False, # Don't parse dates\n",
    " null_values=[\"\"], # Only treat empty string as null\n",
    " **kwargs,\n",
    " )\n",
    " except UnicodeDecodeError:\n",
    " # Fallback encoding\n",
    " df = pl.read_csv(\n",
    " file_path,\n",
    " separator=separator,\n",
    " encoding=fallback_encoding,\n",
    " skip_rows=skip_rows,\n",
    " dtypes=pl.Utf8,\n",
    " infer_schema_length=0,\n",
    " ignore_errors=True,\n",
    " try_parse_dates=False,\n",
    " null_values=[\"\"],\n",
    " **kwargs,\n",
    " )\n",
    " except Exception as csv_error:\n",
    " # Try pandas fallback\n",
    " pd_df = pd.read_csv(\n",
    " file_path,\n",
    " sep=separator,\n",
    " encoding=encoding,\n",
    " dtype=str,\n",
    " na_filter=False,\n",
    " skiprows=skip_rows,\n",
    " **kwargs,\n",
    " )\n",
    " df = pl.from_pandas(pd_df)\n",
    " df = force_text_types(df)\n",
    "\n",
    " # Excel formats\n",
    " elif extension in [\".xlsx\", \".xlsm\", \".xls\"]:\n",
    " try:\n",
    " # Try Polars native reader first\n",
    " df = pl.read_excel(\n",
    " file_path, sheet_name=sheet_name, infer_schema_length=0, **kwargs\n",
    " )\n",
    " df = force_text_types(df)\n",
    " except Exception:\n",
    " # Fallback to pandas\n",
    " try:\n",
    " pd_df = pd.read_excel(\n",
    " file_path,\n",
    " sheet_name=sheet_name\n",
    " if isinstance(sheet_name, (str, int))\n",
    " else 0,\n",
    " dtype=str, # Force all columns as string\n",
    " na_filter=False, # Don't convert to NaN\n",
    " skiprows=skip_rows,\n",
    " **kwargs,\n",
    " )\n",
    " df = pl.from_pandas(pd_df)\n",
    " df = force_text_types(df)\n",
    " except Exception as excel_error:\n",
    " raise ValueError(f\"Cannot read Excel file: {excel_error}\")\n",
    "\n",
    " # Parquet\n",
    " elif extension == \".parquet\":\n",
    " df = pl.read_parquet(file_path, **kwargs)\n",
    " df = force_text_types(df)\n",
    "\n",
    " # JSON/JSONL\n",
    " elif extension in [\".json\", \".jsonl\"]:\n",
    " if extension == \".jsonl\":\n",
    " df = pl.read_ndjson(file_path, infer_schema_length=0, **kwargs)\n",
    " else:\n",
    " df = pl.read_json(file_path, **kwargs)\n",
    " df = force_text_types(df)\n",
    "\n",
    " # Avro\n",
    " elif extension == \".avro\":\n",
    " df = pl.read_avro(file_path, **kwargs)\n",
    " df = force_text_types(df)\n",
    "\n",
    " # Feather/Arrow\n",
    " elif extension in [\".feather\", \".arrow\"]:\n",
    " df = pl.read_ipc(file_path, **kwargs)\n",
    " df = force_text_types(df)\n",
    "\n",
    " else:\n",
    " raise ValueError(f\"Unsupported file format: {extension}\")\n",
    "\n",
    " # Clean column names (snake_case)\n",
    " clean_columns = {}\n",
    " for col in df.columns:\n",
    " # Handle empty column names\n",
    " if col == \"\" or col is None:\n",
    " new_col = f\"column_{len(clean_columns)}\"\n",
    " else:\n",
    " new_col = (\n",
    " col.strip()\n",
    " .replace(\" \", \"_\")\n",
    " .replace(\"-\", \"_\")\n",
    " .replace(\".\", \"_\")\n",
    " .lower()\n",
    " )\n",
    " # Handle duplicate column names\n",
    " if new_col in clean_columns.values():\n",
    " new_col = f\"{new_col}_{len(clean_columns)}\"\n",
    " clean_columns[col] = new_col\n",
    "\n",
    " df = df.rename(clean_columns)\n",
    "\n",
    " # Optional: Clean null-like text values\n",
    " if clean_nulls:\n",
    " df = clean_text_values(df)\n",
    "\n",
    " # Remove completely empty rows\n",
    " df = df.filter(~pl.all_horizontal(pl.all().is_null()))\n",
    "\n",
    " # Remove completely empty columns\n",
    " non_empty_cols = [col for col in df.columns if not df[col].is_null().all()]\n",
    " if non_empty_cols:\n",
    " df = df.select(non_empty_cols)\n",
    "\n",
    " # Filter columns if specified\n",
    " if columns_to_use:\n",
    " available_cols = [col for col in columns_to_use if col in df.columns]\n",
    " if available_cols:\n",
    " df = df.select(available_cols)\n",
    "\n",
    " return df\n",
    "\n",
    " except Exception as e:\n",
    " raise ValueError(f\"Failed to read file '{file_path}': {str(e)}\")\n",
    "\n",
    "\n",
    "def read_preserve_format(\n",
    " file_path: Union[str, Path],\n",
    " preserve_leading_zeros: bool = True,\n",
    " preserve_decimal_places: bool = True,\n",
    " clean_nulls: bool = False, # Default False to preserve all text\n",
    " **kwargs,\n",
    ") -> pl.DataFrame:\n",
    " \"\"\"\n",
    " Enhanced version that preserves formatting like leading zeros and decimal places.\n",
    " \"\"\"\n",
    " df = read_as_text_to_polars(file_path, clean_nulls=clean_nulls, **kwargs)\n",
    "\n",
    " if not preserve_leading_zeros:\n",
    " # Remove leading zeros only from numeric-looking strings\n",
    " for col in df.columns:\n",
    " df = df.with_columns(\n",
    " pl.when(pl.col(col).str.contains(\"^0+[1-9]\\\\d*$\"))\n",
    " .then(pl.col(col).str.lstrip(\"0\"))\n",
    " .otherwise(pl.col(col))\n",
    " .alias(col)\n",
    " )\n",
    "\n",
    " if not preserve_decimal_places:\n",
    " # Remove trailing zeros after decimal\n",
    " for col in df.columns:\n",
    " df = df.with_columns(\n",
    " pl.when(pl.col(col).str.contains(\"\\\\d\\\\.\\\\d*0+$\"))\n",
    " .then(\n",
    " pl.col(col).str.strip_suffix_chars(\"0\").str.strip_suffix_chars(\".\")\n",
    " )\n",
    " .otherwise(pl.col(col))\n",
    " .alias(col)\n",
    " )\n",
    "\n",
    " return df\n",
    "\n",
    "\n",
    "# Quick helper function for simple reading\n",
    "def quick_read_text(file_path: Union[str, Path]) -> pl.DataFrame:\n",
    " \"\"\"\n",
    " Simplified function to quickly read any file as text without any processing.\n",
    " \"\"\"\n",
    " return read_as_text_to_polars(\n",
    " file_path,\n",
    " clean_nulls=False, # Keep everything as-is\n",
    " )\n",
    "\n",
    "\n",
    "# Setup function\n",
    "def setup_environment():\n",
    " \"\"\"Install required packages\"\"\"\n",
    " import subprocess\n",
    " import sys\n",
    "\n",
    " packages = [\"polars>=0.20.0\", \"pandas>=2.0.0\", \"openpyxl\", \"xlrd\", \"pyarrow\"]\n",
    " for package in packages:\n",
    " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", package])\n",
    "\n",
    "\n",
    "# # Example usage\n",
    "# if __name__ == \"__main__\":\n",
    "# # Simple read - preserves everything\n",
    "# df = quick_read_text(\"data.xlsx\")\n",
    "\n",
    "# # Read with options\n",
    "# df = read_as_text_to_polars(\n",
    "# \"data.xlsx\",\n",
    "# sheet_name=0,\n",
    "# clean_nulls=False # Keep \"NA\", \"null\" as text\n",
    "# )\n",
    "\n",
    "# # Read CSV\n",
    "# df_csv = read_as_text_to_polars(\n",
    "# \"data.csv\",\n",
    "# encoding='utf-8',\n",
    "# clean_nulls=True # Convert \"NA\", \"null\" to None\n",
    "# )\n",
    "\n",
    "# # Preserve special formatting\n",
    "# df_formatted = read_preserve_format(\n",
    "# \"data.xlsx\",\n",
    "# preserve_leading_zeros=True, # \"001\" stays \"001\"\n",
    "# preserve_decimal_places=True, # \"1.00\" stays \"1.00\"\n",
    "# clean_nulls=False\n",
    "# )\n",
    "\n",
    "# print(f\"Shape: {df.shape}\")\n",
    "# print(f\"Columns: {df.columns}\")\n",
    "# print(f\"Dtypes: {df.dtypes}\") # All pl.Utf8\n"
    ]
    },
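    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "Optional smoke test for the helpers above: the cell below writes a tiny throwaway CSV to a temp file and reads it back with `quick_read_text`, so you can confirm that every column arrives as `Utf8` and that leading zeros / fixed decimals survive. The file and column names are made up purely for illustration."
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Minimal, self-contained demo (assumes the helper cell above has been run)\n",
    "import os\n",
    "import tempfile\n",
    "\n",
    "# Throwaway CSV with a leading-zero ID and a fixed-decimal amount (illustrative data only)\n",
    "_tmp = tempfile.NamedTemporaryFile(mode=\"w\", suffix=\".csv\", delete=False)\n",
    "_tmp.write(\"Account ID,Amount,Created At\\n001,1.50,2024-01-31\\n002,2.00,2024-02-29\\n\")\n",
    "_tmp.close()\n",
    "\n",
    "demo_df = quick_read_text(_tmp.name)\n",
    "print(demo_df)\n",
    "print(demo_df.dtypes)  # expected: all Utf8, with \"001\" and \"1.50\" kept as text\n",
    "\n",
    "os.unlink(_tmp.name)\n"
    ]
    },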
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "## 1. Data Loading"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "import polars as pl\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "from typing import Optional, Dict, Any, Union\n",
    "import warnings\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "\n",
    "def read_flexible_to_polars(\n",
    " file_path: Union[str, Path],\n",
    " sheet_name: Optional[Union[str, int]] = 0,\n",
    " encoding: str = \"utf-8\",\n",
    " fallback_encoding: str = \"latin-1\",\n",
    " infer_schema_length: int = 10000,\n",
    " **kwargs,\n",
    ") -> pl.DataFrame:\n",
    " \"\"\"\n",
    " Read various file formats into Polars DataFrame with dynamic datatype handling.\n",
    "\n",
    " Supported: csv, tsv, xlsx, xlsm, xls, parquet, json, jsonl, avro, feather\n",
    " \"\"\"\n",
    " file_path = Path(file_path)\n",
    " extension = file_path.suffix.lower()\n",
    "\n",
    " def safe_type_convert(df: pl.DataFrame) -> pl.DataFrame:\n",
    " for col in df.columns:\n",
    " try:\n",
    " if df[col].dtype == pl.Utf8:\n",
    " # Try numeric conversion\n",
    " try:\n",
    " temp = df[col].str.replace_all(\",\", \"\").str.strip_chars()\n",
    " if temp.str.contains(\"^-?\\\\d+$\").all():\n",
    " df = df.with_columns(temp.cast(pl.Int64).alias(col))\n",
    " elif temp.str.contains(\"^-?\\\\d*\\\\.?\\\\d+$\").all():\n",
    " df = df.with_columns(temp.cast(pl.Float64).alias(col))\n",
    " except:\n",
    " pass\n",
    "\n",
    " # Try date conversion\n",
    " try:\n",
    " if df[col].str.contains(\"\\\\d{4}-\\\\d{2}-\\\\d{2}\").any():\n",
    " df = df.with_columns(\n",
    " pl.col(col)\n",
    " .str.strptime(pl.Date, \"%Y-%m-%d\", strict=False)\n",
    " .alias(col)\n",
    " )\n",
    " except:\n",
    " pass\n",
    "\n",
    " # Try datetime conversion\n",
    " try:\n",
    " if (\n",
    " df[col]\n",
    " .str.contains(\"\\\\d{4}-\\\\d{2}-\\\\d{2}.*\\\\d{2}:\\\\d{2}\")\n",
    " .any()\n",
    " ):\n",
    " df = df.with_columns(\n",
    " pl.col(col)\n",
    " .str.strptime(\n",
    " pl.Datetime, \"%Y-%m-%d %H:%M:%S\", strict=False\n",
    " )\n",
    " .alias(col)\n",
    " )\n",
    " except:\n",
    " pass\n",
    "\n",
    " # Try boolean conversion\n",
    " unique_vals = df[col].unique().drop_nulls()\n",
    " if len(unique_vals) <= 2:\n",
    " lower_vals = unique_vals.str.to_lowercase()\n",
    " if set(lower_vals) <= {\n",
    " \"true\",\n",
    " \"false\",\n",
    " \"yes\",\n",
    " \"no\",\n",
    " \"1\",\n",
    " \"0\",\n",
    " \"y\",\n",
    " \"n\",\n",
    " }:\n",
    " mapping = {\n",
    " \"true\": True,\n",
    " \"false\": False,\n",
    " \"yes\": True,\n",
    " \"no\": False,\n",
    " \"1\": True,\n",
    " \"0\": False,\n",
    " \"y\": True,\n",
    " \"n\": False,\n",
    " }\n",
    " df = df.with_columns(\n",
    " pl.col(col)\n",
    " .str.to_lowercase()\n",
    " .map_elements(\n",
    " lambda x: mapping.get(x, None),\n",
    " return_dtype=pl.Boolean,\n",
    " )\n",
    " .alias(col)\n",
    " )\n",
    " except:\n",
    " continue\n",
    " return df\n",
    "\n",
    " try:\n",
    " # CSV/TSV\n",
    " if extension in [\".csv\", \".tsv\"]:\n",
    " separator = \"\\t\" if extension == \".tsv\" else \",\"\n",
    " try:\n",
    " df = pl.read_csv(\n",
    " file_path,\n",
    " separator=separator,\n",
    " encoding=encoding,\n",
    " infer_schema_length=infer_schema_length,\n",
    " ignore_errors=True,\n",
    " try_parse_dates=True,\n",
    " **kwargs,\n",
    " )\n",
    " except:\n",
    " df = pl.read_csv(\n",
    " file_path,\n",
    " separator=separator,\n",
    " encoding=fallback_encoding,\n",
    " infer_schema_length=None,\n",
    " ignore_errors=True,\n",
    " **kwargs,\n",
    " )\n",
    "\n",
    " # Excel formats\n",
    " elif extension in [\".xlsx\", \".xlsm\", \".xls\"]:\n",
    " try:\n",
    " df = pl.read_excel(\n",
    " file_path,\n",
    " sheet_name=sheet_name,\n",
    " infer_schema_length=infer_schema_length,\n",
    " **kwargs,\n",
    " )\n",
    " except:\n",
    " # Fallback to pandas\n",
    " pd_df = pd.read_excel(\n",
    " file_path,\n",
    " sheet_name=sheet_name if isinstance(sheet_name, (str, int)) else 0,\n",
    " **kwargs,\n",
    " )\n",
    " pd_df = pd_df.astype(str).replace([\"nan\", \"NaN\", \"None\"], None)\n",
    " df = pl.from_pandas(pd_df)\n",
    "\n",
    " # Parquet\n",
    " elif extension == \".parquet\":\n",
    " df = pl.read_parquet(file_path, **kwargs)\n",
    "\n",
    " # JSON/JSONL\n",
    " elif extension in [\".json\", \".jsonl\"]:\n",
    " if extension == \".jsonl\":\n",
    " df = pl.read_ndjson(\n",
    " file_path, infer_schema_length=infer_schema_length, **kwargs\n",
    " )\n",
    " else:\n",
    " df = pl.read_json(file_path, **kwargs)\n",
    "\n",
    " # Avro\n",
    " elif extension == \".avro\":\n",
    " df = pl.read_avro(file_path, **kwargs)\n",
    "\n",
    " # Feather/Arrow\n",
    " elif extension in [\".feather\", \".arrow\"]:\n",
    " df = pl.read_ipc(file_path, **kwargs)\n",
    "\n",
    " else:\n",
    " raise ValueError(f\"Unsupported file format: {extension}\")\n",
    "\n",
    " # Clean column names\n",
    " df = df.rename(\n",
    " {col: col.strip().replace(\" \", \"_\").lower() for col in df.columns}\n",
    " )\n",
    "\n",
    " # Apply smart type conversion\n",
    " df = safe_type_convert(df)\n",
    "\n",
    " # Remove completely empty rows and columns\n",
    " df = df.filter(~pl.all_horizontal(pl.all().is_null()))\n",
    " df = df.select([col for col in df.columns if not df[col].is_null().all()])\n",
    "\n",
    " return df\n",
    "\n",
    " except Exception as e:\n",
    " # Ultimate fallback: read as string and convert\n",
    " try:\n",
    " if extension in [\".csv\", \".tsv\"]:\n",
    " df = pl.read_csv(\n",
    " file_path,\n",
    " separator=\"\\t\" if extension == \".tsv\" else \",\",\n",
    " encoding=\"utf-8\",\n",
    " infer_schema_length=None,\n",
    " dtypes={col: pl.Utf8 for col in range(100)},\n",
    " ignore_errors=True,\n",
    " has_header=True,\n",
    " )\n",
    " return safe_type_convert(df)\n",
    " except:\n",
    " pass\n",
    " raise ValueError(f\"Failed to read file: {e}\")\n",
    "\n",
    "\n",
    "# Usage setup\n",
    "def setup_environment():\n",
    " \"\"\"Install required packages\"\"\"\n",
    " import subprocess\n",
    " import sys\n",
    "\n",
    " packages = [\"polars\", \"pandas\", \"openpyxl\", \"xlrd\", \"pyarrow\", \"fastparquet\"]\n",
    " for package in packages:\n",
    " subprocess.check_call(\n",
    " [sys.executable, \"-m\", \"uv pip\", \"install\", \"-q\", package]\n",
    " )\n",
    "\n",
    "\n",
    "# # Load data using the robust function\n",
    "# file_path = \"input/202508_new_checking_with_final_output_analysis_with_ruleV2_asof_20250915_v3.xlsx\"\n",
    "\n",
    "# # Read any supported file\n",
    "# main_vatwht_df = read_flexible_to_polars(file_path, sheet_name=0)\n",
    "\n",
    "# # With specific encoding for CSV\n",
    "# main_vatwht_df = read_flexible_to_polars(file_path, encoding=\"utf-8\")\n",
    "\n",
    "# # With additional parameters\n",
    "# main_vatwht_df = read_flexible_to_polars(\n",
    "# file_path, sheet_name=\"Sheet1\", infer_schema_length=50000\n",
    "# )\n"
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "## 2. Data Overview"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Basic info\n",
    "print(\"=== Dataset Information ===\")\n",
    "print(f\"Shape: {df.shape}\")\n",
    "print(f\"Columns: {df.columns}\")\n",
    "print(\"\\n=== Data Types ===\")\n",
    "print(df.dtypes)\n"
    ]
    },
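    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "A quick visual preview (assumes `df` has been loaded in the Data Loading section): `glimpse()` prints one line per column, `head()` shows the first rows."
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Column-wise glimpse (dtype + first values per column), then the first rows\n",
    "df.glimpse()\n",
    "df.head(10)\n"
    ]
    },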
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Schema overview\n",
    "df.schema\n"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Statistical summary\n",
    "df.describe()\n"
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "## 3. Data Quality Assessment"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Check for missing values\n",
    "missing_info = df.null_count()\n",
    "missing_pct = (missing_info / len(df) * 100).round(2)\n",
    "\n",
    "quality_summary = pl.DataFrame(\n",
    " {\n",
    " \"column\": df.columns,\n",
    " \"null_count\": missing_info.row(0),\n",
    " \"null_percentage\": missing_pct.row(0),\n",
    " }\n",
    ")\n",
    "\n",
    "print(\"=== Missing Values Summary ===\")\n",
    "quality_summary.filter(pl.col(\"null_count\") > 0)\n"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Check for duplicates\n",
    "duplicate_count = df.height - df.unique().height\n",
    "print(f\"Duplicate rows: {duplicate_count}\")\n",
    "print(f\"Duplicate percentage: {(duplicate_count / df.height * 100):.2f}%\")\n"
    ]
    },
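    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "If duplicates exist, it often helps to look at them directly. This optional cell assumes `df` is loaded and uses `DataFrame.is_duplicated()` to pull out every row that occurs more than once."
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Show the duplicated rows themselves (every occurrence, not just the extra copies)\n",
    "duplicated_rows = df.filter(df.is_duplicated())\n",
    "duplicated_rows.head(20)\n"
    ]
    },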
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "## 4. Exploratory Data Analysis"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Select numeric columns for analysis\n",
    "numeric_cols = df.select(pl.col(pl.NUMERIC_DTYPES)).columns\n",
    "categorical_cols = df.select(pl.col(pl.STRING_DTYPES)).columns\n",
    "\n",
    "print(f\"Numeric columns ({len(numeric_cols)}): {numeric_cols}\")\n",
    "print(f\"Categorical columns ({len(categorical_cols)}): {categorical_cols}\")\n"
    ]
    },
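    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "Note: `pl.NUMERIC_DTYPES` / `pl.STRING_DTYPES` are deprecated in recent Polars releases. A hedged alternative using column selectors (assumes a Polars version that ships `polars.selectors`):"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Same numeric/categorical split via column selectors (newer Polars versions)\n",
    "import polars.selectors as cs\n",
    "\n",
    "numeric_cols = df.select(cs.numeric()).columns\n",
    "categorical_cols = df.select(cs.string()).columns\n",
    "\n",
    "print(f\"Numeric columns ({len(numeric_cols)}): {numeric_cols}\")\n",
    "print(f\"Categorical columns ({len(categorical_cols)}): {categorical_cols}\")\n"
    ]
    },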
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Distribution of numeric variables\n",
    "if numeric_cols:\n",
    " n_cols = min(3, len(numeric_cols))\n",
    " n_rows = (len(numeric_cols) + n_cols - 1) // n_cols\n",
    "\n",
    " fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows))\n",
    " axes = axes.flatten() if len(numeric_cols) > 1 else [axes]\n",
    "\n",
    " for i, col in enumerate(numeric_cols[: len(axes)]):\n",
    " data = df[col].to_numpy()\n",
    " data = data[~np.isnan(data)] # Remove NaN values\n",
    " axes[i].hist(data, bins=30, alpha=0.7, edgecolor=\"black\")\n",
    " axes[i].set_title(f\"Distribution of {col}\")\n",
    " axes[i].set_xlabel(col)\n",
    " axes[i].set_ylabel(\"Frequency\")\n",
    "\n",
    " # Hide empty subplots\n",
    " for i in range(len(numeric_cols), len(axes)):\n",
    " axes[i].set_visible(False)\n",
    "\n",
    " plt.tight_layout()\n",
    " plt.show()\n"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Value counts for categorical variables (top categories)\n",
    "if categorical_cols:\n",
    " print(\"=== Top Categories ===\")\n",
    " for col in categorical_cols[:5]: # Show first 5 categorical columns\n",
    " print(f\"\\n{col}:\")\n",
    " top_values = df[col].value_counts().head(10)\n",
    " print(top_values)\n"
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "## 5. Correlation Analysis"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Correlation matrix for numeric variables\n",
    "if len(numeric_cols) > 1:\n",
    " # Convert to pandas for correlation (Polars correlation support is limited)\n",
    " numeric_df = df.select(numeric_cols).to_pandas()\n",
    " correlation_matrix = numeric_df.corr()\n",
    "\n",
    " plt.figure(figsize=(10, 8))\n",
    " sns.heatmap(\n",
    " correlation_matrix,\n",
    " annot=True,\n",
    " cmap=\"coolwarm\",\n",
    " center=0,\n",
    " square=True,\n",
    " linewidths=0.5,\n",
    " )\n",
    " plt.title(\"Correlation Matrix\")\n",
    " plt.tight_layout()\n",
    " plt.show()\n",
    "else:\n",
    " print(\"Not enough numeric columns for correlation analysis\")\n"
    ]
    },
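    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "Pairwise Pearson correlations can also be computed natively in Polars with the `pl.corr` expression, skipping the pandas round-trip. A small sketch (numbers only, no heatmap):"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Pairwise correlations computed directly in Polars via pl.corr\n",
    "if len(numeric_cols) > 1:\n",
    "    pairs = [\n",
    "        pl.corr(a, b).alias(f\"{a} ~ {b}\")\n",
    "        for i, a in enumerate(numeric_cols)\n",
    "        for b in numeric_cols[i + 1 :]\n",
    "    ]\n",
    "    print(df.select(pairs))\n",
    "else:\n",
    "    print(\"Not enough numeric columns for correlation analysis\")\n"
    ]
    },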
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "## 6. Data Filtering & Transformation Examples"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Example transformations with Polars\n",
    "# Uncomment and modify as needed\n",
    "\n",
    "# # Filter data\n",
    "# filtered_df = df.filter(pl.col(\"your_column\") > 100)\n",
    "\n",
    "# # Group by and aggregate\n",
    "# grouped = df.group_by(\"category_column\").agg([\n",
    "# pl.col(\"numeric_column\").mean().alias(\"avg_value\"),\n",
    "# pl.col(\"numeric_column\").count().alias(\"count\")\n",
    "# ])\n",
    "\n",
    "# # Create new columns\n",
    "# df_with_new_col = df.with_columns([\n",
    "# (pl.col(\"col1\") / pl.col(\"col2\")).alias(\"ratio\"),\n",
    "# pl.col(\"date_column\").dt.year().alias(\"year\")\n",
    "# ])\n",
    "\n",
    "print(\"Transform your data using Polars expressions above\")\n"
    ]
    },
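    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "The snippets above use placeholder column names, so they are left commented out. The next cell runs the same patterns on a tiny made-up DataFrame (the `category` / `value` columns are purely illustrative), so it executes without any external data."
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Filter, group/aggregate, and derive a column on a small demo frame\n",
    "demo = pl.DataFrame(\n",
    "    {\n",
    "        \"category\": [\"a\", \"a\", \"b\", \"b\", \"b\"],\n",
    "        \"value\": [10, 20, 5, 15, 25],\n",
    "    }\n",
    ")\n",
    "\n",
    "filtered = demo.filter(pl.col(\"value\") > 10)\n",
    "grouped = demo.group_by(\"category\").agg(\n",
    "    [\n",
    "        pl.col(\"value\").mean().alias(\"avg_value\"),\n",
    "        pl.col(\"value\").count().alias(\"count\"),\n",
    "    ]\n",
    ")\n",
    "with_share = demo.with_columns((pl.col(\"value\") / pl.col(\"value\").sum()).alias(\"share\"))\n",
    "\n",
    "print(filtered)\n",
    "print(grouped)\n",
    "print(with_share)\n"
    ]
    },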
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "## 7. Export Results"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Save processed data\n",
    "# df.write_parquet(\"processed_data.parquet\")\n",
    "# df.write_csv(\"processed_data.csv\")\n",
    "\n",
    "print(\"Analysis complete!\")\n",
    "print(f\"Final dataset shape: {df.shape}\")\n"
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "## Quick Reference: Common Polars Operations\n",
    "\n",
    "```python\n",
    "# Loading data\n",
    "df = pl.read_csv(\"file.csv\")\n",
    "df = pl.read_parquet(\"file.parquet\")\n",
    "df = pl.read_excel(\"file.xlsx\")\n",
    "\n",
    "# Basic operations\n",
    "df.head(n) # First n rows\n",
    "df.tail(n) # Last n rows\n",
    "df.shape # Dimensions\n",
    "df.columns # Column names\n",
    "df.dtypes # Data types\n",
    "df.describe() # Statistical summary\n",
    "\n",
    "# Filtering and selection\n",
    "df.filter(pl.col(\"col\") > 10)\n",
    "df.select([\"col1\", \"col2\"])\n",
    "df.drop([\"col1\", \"col2\"])\n",
    "\n",
    "# Grouping and aggregation\n",
    "df.group_by(\"col\").agg(pl.col(\"val\").sum())\n",
    "df.group_by([\"col1\", \"col2\"]).count()\n",
    "\n",
    "# Transformations\n",
    "df.with_columns(pl.col(\"col\").alias(\"new_name\"))\n",
    "df.sort(\"col\", descending=True)\n",
    "df.unique()\n",
    "```"
    ]
    }
    ],
    "metadata": {
    "kernelspec": {
    "display_name": "Python 3",
    "language": "python",
    "name": "python3"
    },
    "language_info": {
    "codemirror_mode": {
    "name": "ipython",
    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.8.0"
    }
    },
    "nbformat": 4,
    "nbformat_minor": 4
    }