# Prerequisite

```python
import polars as pl
import pandas as pd
from pathlib import Path
from typing import Optional, Dict, Any, Union, List
import warnings

warnings.filterwarnings("ignore")
```

# Data Exploration & Analysis with Polars

A lean and efficient EDA template using Polars for fast data processing.

## Functions & Utilities

`read_as_text_to_polars` reads any supported file with every column kept as text, so identifiers, leading zeros, and fixed decimal places survive exactly as written.

```python
def read_as_text_to_polars(
    file_path: Union[str, Path],
    sheet_name: Optional[Union[str, int]] = 0,
    encoding: str = "utf-8",
    fallback_encoding: str = "latin-1",
    skip_rows: int = 0,
    columns_to_use: Optional[List[str]] = None,
    clean_nulls: bool = True,
    **kwargs,
) -> pl.DataFrame:
    """
    Read various file formats into a Polars DataFrame, keeping all data as text/string.
    Preserves original formatting without type conversion.

    Supported: csv, tsv, xlsx, xlsm, xls, parquet, json, jsonl, avro, feather
    """
    file_path = Path(file_path)
    extension = file_path.suffix.lower()

    def force_text_types(df: pl.DataFrame) -> pl.DataFrame:
        """Convert all columns to UTF8/String type"""
        for col in df.columns:
            if df[col].dtype != pl.Utf8:
                df = df.with_columns(pl.col(col).cast(pl.Utf8).alias(col))
        return df

    def clean_text_values(df: pl.DataFrame) -> pl.DataFrame:
        """Clean text values while preserving format"""
        null_values = ["nan", "NaN", "NA", "N/A", "null", "NULL", "None", "NONE", ""]

        for col in df.columns:
            # Trim whitespace first
            df = df.with_columns(pl.col(col).str.strip_chars().alias(col))

            # Replace null-like values with actual null
            df = df.with_columns(
                pl.when(pl.col(col).is_in(null_values))
                .then(None)
                .otherwise(pl.col(col))
                .alias(col)
            )

        return df

    try:
        # CSV/TSV
        if extension in [".csv", ".tsv"]:
            separator = "\t" if extension == ".tsv" else ","
            try:
                # Force all columns to be read as UTF8
                df = pl.read_csv(
                    file_path,
                    separator=separator,
                    encoding=encoding,
                    skip_rows=skip_rows,
                    dtypes=pl.Utf8,  # Force all columns as text
                    infer_schema_length=0,  # Disable type inference
                    ignore_errors=True,
                    try_parse_dates=False,  # Don't parse dates
                    null_values=[""],  # Only treat empty string as null
                    **kwargs,
                )
            except UnicodeDecodeError:
                # Fallback encoding
                df = pl.read_csv(
                    file_path,
                    separator=separator,
                    encoding=fallback_encoding,
                    skip_rows=skip_rows,
                    dtypes=pl.Utf8,
                    infer_schema_length=0,
                    ignore_errors=True,
                    try_parse_dates=False,
                    null_values=[""],
                    **kwargs,
                )
            except Exception:
                # Try pandas fallback
                pd_df = pd.read_csv(
                    file_path,
                    sep=separator,
                    encoding=encoding,
                    dtype=str,
                    na_filter=False,
                    skiprows=skip_rows,
                    **kwargs,
                )
                df = pl.from_pandas(pd_df)
                df = force_text_types(df)

        # Excel formats
        elif extension in [".xlsx", ".xlsm", ".xls"]:
            try:
                # Try Polars native reader first
                df = pl.read_excel(
                    file_path, sheet_name=sheet_name, infer_schema_length=0, **kwargs
                )
                df = force_text_types(df)
            except Exception:
                # Fallback to pandas
                try:
                    pd_df = pd.read_excel(
                        file_path,
                        sheet_name=sheet_name
                        if isinstance(sheet_name, (str, int))
                        else 0,
                        dtype=str,  # Force all columns as string
                        na_filter=False,  # Don't convert to NaN
                        skiprows=skip_rows,
                        **kwargs,
                    )
                    df = pl.from_pandas(pd_df)
                    df = force_text_types(df)
                except Exception as excel_error:
                    raise ValueError(f"Cannot read Excel file: {excel_error}")

        # Parquet
        elif extension == ".parquet":
            df = pl.read_parquet(file_path, **kwargs)
            df = force_text_types(df)

        # JSON/JSONL
        elif extension in [".json", ".jsonl"]:
            if extension == ".jsonl":
                df = pl.read_ndjson(file_path, infer_schema_length=0, **kwargs)
            else:
                df = pl.read_json(file_path, **kwargs)
            df = force_text_types(df)

        # Avro
        elif extension == ".avro":
            df = pl.read_avro(file_path, **kwargs)
            df = force_text_types(df)

        # Feather/Arrow
        elif extension in [".feather", ".arrow"]:
            df = pl.read_ipc(file_path, **kwargs)
            df = force_text_types(df)

        else:
            raise ValueError(f"Unsupported file format: {extension}")

        # Clean column names (snake_case)
        clean_columns = {}
        for col in df.columns:
            # Handle empty column names
            if col == "" or col is None:
                new_col = f"column_{len(clean_columns)}"
            else:
                new_col = (
                    col.strip()
                    .replace(" ", "_")
                    .replace("-", "_")
                    .replace(".", "_")
                    .lower()
                )
            # Handle duplicate column names
            if new_col in clean_columns.values():
                new_col = f"{new_col}_{len(clean_columns)}"
            clean_columns[col] = new_col

        df = df.rename(clean_columns)

        # Optional: Clean null-like text values
        if clean_nulls:
            df = clean_text_values(df)

        # Remove completely empty rows
        df = df.filter(~pl.all_horizontal(pl.all().is_null()))

        # Remove completely empty columns
        non_empty_cols = [col for col in df.columns if not df[col].is_null().all()]
        if non_empty_cols:
            df = df.select(non_empty_cols)

        # Filter columns if specified
        if columns_to_use:
            available_cols = [col for col in columns_to_use if col in df.columns]
            if available_cols:
                df = df.select(available_cols)

        return df

    except Exception as e:
        raise ValueError(f"Failed to read file '{file_path}': {str(e)}")
```
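As a quick sanity check of the text-preserving reader, here is a minimal, hypothetical round trip. The file name, columns, and values below are invented for illustration; the only assumption is the `read_as_text_to_polars` helper defined above.

```python
# Hypothetical smoke test for read_as_text_to_polars; the data is made up for illustration.
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    sample = Path(tmp) / "sample.csv"
    sample.write_text("Account No,Amount,Status\n00123,1.50,NA\n00456,2.00,done\n")

    df_text = read_as_text_to_polars(sample, clean_nulls=True)

    print(df_text.columns)  # expect ['account_no', 'amount', 'status'] after snake_casing
    print(df_text.dtypes)   # expect every column as Utf8/String
    print(df_text.row(0))   # expect ('00123', '1.50', None): zeros kept, "NA" nulled
```

Because nothing is cast, the same pattern works for ID-like columns that a default CSV or Excel reader would otherwise mangle.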
`read_preserve_format` wraps the text reader and can optionally normalize leading zeros or trailing decimal zeros, while `quick_read_text` and `setup_environment` are small conveniences:

```python
def read_preserve_format(
    file_path: Union[str, Path],
    preserve_leading_zeros: bool = True,
    preserve_decimal_places: bool = True,
    clean_nulls: bool = False,  # Default False to preserve all text
    **kwargs,
) -> pl.DataFrame:
    """
    Enhanced version that preserves formatting like leading zeros and decimal places.
    """
    df = read_as_text_to_polars(file_path, clean_nulls=clean_nulls, **kwargs)

    if not preserve_leading_zeros:
        # Remove leading zeros only from numeric-looking strings
        for col in df.columns:
            df = df.with_columns(
                pl.when(pl.col(col).str.contains("^0+[1-9]\\d*$"))
                .then(pl.col(col).str.strip_chars_start("0"))
                .otherwise(pl.col(col))
                .alias(col)
            )

    if not preserve_decimal_places:
        # Remove trailing zeros after the decimal point
        for col in df.columns:
            df = df.with_columns(
                pl.when(pl.col(col).str.contains("\\d\\.\\d*0+$"))
                .then(
                    pl.col(col).str.strip_chars_end("0").str.strip_chars_end(".")
                )
                .otherwise(pl.col(col))
                .alias(col)
            )

    return df


# Quick helper function for simple reading
def quick_read_text(file_path: Union[str, Path]) -> pl.DataFrame:
    """
    Simplified function to quickly read any file as text without any processing.
    """
    return read_as_text_to_polars(
        file_path,
        clean_nulls=False,  # Keep everything as-is
    )


# Setup function
def setup_environment():
    """Install required packages"""
    import subprocess
    import sys

    packages = ["polars>=0.20.0", "pandas>=2.0.0", "openpyxl", "xlrd", "pyarrow"]
    for package in packages:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])


# # Example usage
# if __name__ == "__main__":
#     # Simple read - preserves everything
#     df = quick_read_text("data.xlsx")
#
#     # Read with options
#     df = read_as_text_to_polars(
#         "data.xlsx",
#         sheet_name=0,
#         clean_nulls=False,  # Keep "NA", "null" as text
#     )
#
#     # Read CSV
#     df_csv = read_as_text_to_polars(
#         "data.csv",
#         encoding="utf-8",
#         clean_nulls=True,  # Convert "NA", "null" to None
#     )
#
#     # Preserve special formatting
#     df_formatted = read_preserve_format(
#         "data.xlsx",
#         preserve_leading_zeros=True,   # "001" stays "001"
#         preserve_decimal_places=True,  # "1.00" stays "1.00"
#         clean_nulls=False,
#     )
#
#     print(f"Shape: {df.shape}")
#     print(f"Columns: {df.columns}")
#     print(f"Dtypes: {df.dtypes}")  # All pl.Utf8
```
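To see the two normalization switches in action, here is an illustrative sketch with another invented throwaway CSV: the default call keeps formatting untouched, while disabling the flags strips leading zeros and trailing decimal zeros.

```python
# Illustrative only: hypothetical data exercising the formatting switches above.
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    sample = Path(tmp) / "sample.csv"
    sample.write_text("id,price\n00123,1.500\n00456,2.00\n")

    kept = read_preserve_format(sample)  # defaults keep "00123" and "1.500" as-is
    normalized = read_preserve_format(
        sample,
        preserve_leading_zeros=False,   # "00123" -> "123"
        preserve_decimal_places=False,  # "1.500" -> "1.5", "2.00" -> "2"
    )

    print(kept.row(0))        # expect ('00123', '1.500')
    print(normalized.rows())  # expect [('123', '1.5'), ('456', '2')]
```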
## 1. Data Loading

`read_flexible_to_polars` is the counterpart of the text reader: it loads the same formats but infers numeric, date, datetime, and boolean types dynamically.

```python
import polars as pl
import pandas as pd
from pathlib import Path
from typing import Optional, Dict, Any, Union
import warnings

warnings.filterwarnings("ignore")


def read_flexible_to_polars(
    file_path: Union[str, Path],
    sheet_name: Optional[Union[str, int]] = 0,
    encoding: str = "utf-8",
    fallback_encoding: str = "latin-1",
    infer_schema_length: int = 10000,
    **kwargs,
) -> pl.DataFrame:
    """
    Read various file formats into a Polars DataFrame with dynamic datatype handling.

    Supported: csv, tsv, xlsx, xlsm, xls, parquet, json, jsonl, avro, feather
    """
    file_path = Path(file_path)
    extension = file_path.suffix.lower()

    def safe_type_convert(df: pl.DataFrame) -> pl.DataFrame:
        for col in df.columns:
            try:
                if df[col].dtype == pl.Utf8:
                    # Try numeric conversion
                    try:
                        temp = df[col].str.replace_all(",", "").str.strip_chars()
                        if temp.str.contains("^-?\\d+$").all():
                            df = df.with_columns(temp.cast(pl.Int64).alias(col))
                        elif temp.str.contains("^-?\\d*\\.?\\d+$").all():
                            df = df.with_columns(temp.cast(pl.Float64).alias(col))
                    except Exception:
                        pass

                    # Try date conversion
                    try:
                        if df[col].str.contains("\\d{4}-\\d{2}-\\d{2}").any():
                            df = df.with_columns(
                                pl.col(col)
                                .str.strptime(pl.Date, "%Y-%m-%d", strict=False)
                                .alias(col)
                            )
                    except Exception:
                        pass

                    # Try datetime conversion
                    try:
                        if (
                            df[col]
                            .str.contains("\\d{4}-\\d{2}-\\d{2}.*\\d{2}:\\d{2}")
                            .any()
                        ):
                            df = df.with_columns(
                                pl.col(col)
                                .str.strptime(
                                    pl.Datetime, "%Y-%m-%d %H:%M:%S", strict=False
                                )
                                .alias(col)
                            )
                    except Exception:
                        pass

                    # Try boolean conversion
                    unique_vals = df[col].unique().drop_nulls()
                    if len(unique_vals) <= 2:
                        lower_vals = unique_vals.str.to_lowercase()
                        if set(lower_vals) <= {
                            "true",
                            "false",
                            "yes",
                            "no",
                            "1",
                            "0",
                            "y",
                            "n",
                        }:
                            mapping = {
                                "true": True,
                                "false": False,
                                "yes": True,
                                "no": False,
                                "1": True,
                                "0": False,
                                "y": True,
                                "n": False,
                            }
                            df = df.with_columns(
                                pl.col(col)
                                .str.to_lowercase()
                                .map_elements(
                                    lambda x: mapping.get(x, None),
                                    return_dtype=pl.Boolean,
                                )
                                .alias(col)
                            )
            except Exception:
                continue
        return df

    try:
        # CSV/TSV
        if extension in [".csv", ".tsv"]:
            separator = "\t" if extension == ".tsv" else ","
            try:
                df = pl.read_csv(
                    file_path,
                    separator=separator,
                    encoding=encoding,
                    infer_schema_length=infer_schema_length,
                    ignore_errors=True,
                    try_parse_dates=True,
                    **kwargs,
                )
            except Exception:
                df = pl.read_csv(
                    file_path,
                    separator=separator,
                    encoding=fallback_encoding,
                    infer_schema_length=None,
                    ignore_errors=True,
                    **kwargs,
                )

        # Excel formats
        elif extension in [".xlsx", ".xlsm", ".xls"]:
            try:
                df = pl.read_excel(
                    file_path,
                    sheet_name=sheet_name,
                    infer_schema_length=infer_schema_length,
                    **kwargs,
                )
            except Exception:
                # Fallback to pandas
                pd_df = pd.read_excel(
                    file_path,
                    sheet_name=sheet_name if isinstance(sheet_name, (str, int)) else 0,
                    **kwargs,
                )
                pd_df = pd_df.astype(str).replace(["nan", "NaN", "None"], None)
                df = pl.from_pandas(pd_df)

        # Parquet
        elif extension == ".parquet":
            df = pl.read_parquet(file_path, **kwargs)

        # JSON/JSONL
        elif extension in [".json", ".jsonl"]:
            if extension == ".jsonl":
                df = pl.read_ndjson(
                    file_path, infer_schema_length=infer_schema_length, **kwargs
                )
            else:
                df = pl.read_json(file_path, **kwargs)

        # Avro
        elif extension == ".avro":
            df = pl.read_avro(file_path, **kwargs)

        # Feather/Arrow
        elif extension in [".feather", ".arrow"]:
            df = pl.read_ipc(file_path, **kwargs)

        else:
            raise ValueError(f"Unsupported file format: {extension}")

        # Clean column names
        df = df.rename(
            {col: col.strip().replace(" ", "_").lower() for col in df.columns}
        )

        # Apply smart type conversion
        df = safe_type_convert(df)

        # Remove completely empty rows and columns
        df = df.filter(~pl.all_horizontal(pl.all().is_null()))
        df = df.select([col for col in df.columns if not df[col].is_null().all()])

        return df

    except Exception as e:
        # Ultimate fallback: read everything as text, then re-infer types
        try:
            if extension in [".csv", ".tsv"]:
                df = pl.read_csv(
                    file_path,
                    separator="\t" if extension == ".tsv" else ",",
                    encoding="utf-8",
                    infer_schema_length=0,  # read every column as Utf8
                    ignore_errors=True,
                    has_header=True,
                )
                return safe_type_convert(df)
        except Exception:
            pass
        raise ValueError(f"Failed to read file: {e}")


# Usage setup
def setup_environment():
    """Install required packages"""
    import subprocess
    import sys

    packages = ["polars", "pandas", "openpyxl", "xlrd", "pyarrow", "fastparquet"]
    for package in packages:
        # Requires the `uv` package; swap in plain pip if uv is not installed
        subprocess.check_call(
            [sys.executable, "-m", "uv", "pip", "install", "-q", package]
        )


# # Load data using the robust function
# file_path = "input/202508_new_checking_with_final_output_analysis_with_ruleV2_asof_20250915_v3.xlsx"

# # Read any supported file
# main_vatwht_df = read_flexible_to_polars(file_path, sheet_name=0)

# # With specific encoding for CSV
# main_vatwht_df = read_flexible_to_polars(file_path, encoding="utf-8")

# # With additional parameters
# main_vatwht_df = read_flexible_to_polars(
#     file_path, sheet_name="Sheet1", infer_schema_length=50000
# )
```
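A hypothetical end-to-end check of the dynamic typing path; the file, column names, and expected schema below are illustrative, not from the original notebook.

```python
# Invented example: verify that dates, integers, and yes/no flags get typed.
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    sample = Path(tmp) / "typed.csv"
    sample.write_text(
        "Invoice Date,Amount,Paid\n2025-01-31,1200,yes\n2025-02-28,980,no\n"
    )

    typed_df = read_flexible_to_polars(sample)
    print(typed_df.schema)
    # Expected roughly: invoice_date -> Date, amount -> Int64, paid -> Boolean
```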
## 2. Data Overview

The cells below assume a DataFrame `df` has already been loaded with one of the readers above (the loading calls are left commented out in the template).

```python
# Basic info
print("=== Dataset Information ===")
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns}")
print("\n=== Data Types ===")
print(df.dtypes)
```

```python
# Schema overview
df.schema
```

```python
# Statistical summary
df.describe()
```

## 3. Data Quality Assessment

```python
# Check for missing values
null_counts = df.null_count().row(0)

quality_summary = pl.DataFrame(
    {
        "column": df.columns,
        "null_count": null_counts,
        "null_percentage": [round(c / df.height * 100, 2) for c in null_counts],
    }
)

print("=== Missing Values Summary ===")
quality_summary.filter(pl.col("null_count") > 0)
```

```python
# Check for duplicates
duplicate_count = df.height - df.unique().height
print(f"Duplicate rows: {duplicate_count}")
print(f"Duplicate percentage: {(duplicate_count / df.height * 100):.2f}%")
```

## 4. Exploratory Data Analysis

```python
# Select numeric columns for analysis
numeric_cols = df.select(pl.col(pl.NUMERIC_DTYPES)).columns
categorical_cols = df.select(pl.col(pl.STRING_DTYPES)).columns

print(f"Numeric columns ({len(numeric_cols)}): {numeric_cols}")
print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")
```

```python
# Distribution of numeric variables
import matplotlib.pyplot as plt  # plotting deps are not in the prerequisite cell
import numpy as np

if numeric_cols:
    n_cols = min(3, len(numeric_cols))
    n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows))
    axes = axes.flatten() if len(numeric_cols) > 1 else [axes]

    for i, col in enumerate(numeric_cols[: len(axes)]):
        data = df[col].to_numpy()
        data = data[~np.isnan(data)]  # Remove NaN values
        axes[i].hist(data, bins=30, alpha=0.7, edgecolor="black")
        axes[i].set_title(f"Distribution of {col}")
        axes[i].set_xlabel(col)
        axes[i].set_ylabel("Frequency")

    # Hide empty subplots
    for i in range(len(numeric_cols), len(axes)):
        axes[i].set_visible(False)

    plt.tight_layout()
    plt.show()
```

```python
# Value counts for categorical variables (top categories)
if categorical_cols:
    print("=== Top Categories ===")
    for col in categorical_cols[:5]:  # Show first 5 categorical columns
        print(f"\n{col}:")
        top_values = df[col].value_counts().head(10)
        print(top_values)
```
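The `pl.NUMERIC_DTYPES` / `pl.STRING_DTYPES` constants in the column-selection cell above may be deprecated or unavailable on newer Polars releases; a sketch of the selector-based alternative, producing the same `numeric_cols` / `categorical_cols` names:

```python
# Alternative column selection via polars.selectors (available in recent Polars versions)
import polars.selectors as cs

numeric_cols = df.select(cs.numeric()).columns
categorical_cols = df.select(cs.string()).columns

print(f"Numeric columns ({len(numeric_cols)}): {numeric_cols}")
print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")
```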
## 5. Correlation Analysis

```python
# Correlation matrix for numeric variables
import matplotlib.pyplot as plt
import seaborn as sns

if len(numeric_cols) > 1:
    # Convert to pandas for correlation (Polars correlation support is limited)
    numeric_df = df.select(numeric_cols).to_pandas()
    correlation_matrix = numeric_df.corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        cmap="coolwarm",
        center=0,
        square=True,
        linewidths=0.5,
    )
    plt.title("Correlation Matrix")
    plt.tight_layout()
    plt.show()
else:
    print("Not enough numeric columns for correlation analysis")
```

## 6. Data Filtering & Transformation Examples

```python
# Example transformations with Polars
# Uncomment and modify as needed

# # Filter data
# filtered_df = df.filter(pl.col("your_column") > 100)

# # Group by and aggregate
# grouped = df.group_by("category_column").agg([
#     pl.col("numeric_column").mean().alias("avg_value"),
#     pl.col("numeric_column").count().alias("count"),
# ])

# # Create new columns
# df_with_new_col = df.with_columns([
#     (pl.col("col1") / pl.col("col2")).alias("ratio"),
#     pl.col("date_column").dt.year().alias("year"),
# ])

print("Transform your data using Polars expressions above")
```

## 7. Export Results

```python
# Save processed data
# df.write_parquet("processed_data.parquet")
# df.write_csv("processed_data.csv")

print("Analysis complete!")
print(f"Final dataset shape: {df.shape}")
```

## Quick Reference: Common Polars Operations

```python
# Loading data
df = pl.read_csv("file.csv")
df = pl.read_parquet("file.parquet")
df = pl.read_excel("file.xlsx")

# Basic operations
df.head(n)     # First n rows
df.tail(n)     # Last n rows
df.shape       # Dimensions
df.columns     # Column names
df.dtypes      # Data types
df.describe()  # Statistical summary

# Filtering and selection
df.filter(pl.col("col") > 10)
df.select(["col1", "col2"])
df.drop(["col1", "col2"])

# Grouping and aggregation
df.group_by("col").agg(pl.col("val").sum())
df.group_by(["col1", "col2"]).count()

# Transformations
df.with_columns(pl.col("col").alias("new_name"))
df.sort("col", descending=True)
df.unique()
```
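One habit worth keeping alongside the quick reference: for files too large to materialize eagerly, Polars' lazy API defers all work until `collect()`. A small sketch with placeholder file and column names:

```python
# Lazy pipeline sketch; "sales.csv", "amount", and "category" are placeholders.
import polars as pl

lazy_summary = (
    pl.scan_csv("sales.csv")              # nothing is read yet
    .filter(pl.col("amount") > 0)         # predicate pushed down to the scan
    .group_by("category")
    .agg(pl.col("amount").sum().alias("total_amount"))
    .sort("total_amount", descending=True)
)

result = lazy_summary.collect()  # the query plan executes here
print(result)
```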