{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# NumPy one-liners for feature engineering\n",
        "\n",
        "Ten short NumPy snippets, each building one kind of feature from a small sample array. Run the cells top to bottom: the first code cell imports NumPy for all the others."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 1. Robust scaling with the median absolute deviation (MAD)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "t0kGpJIhciXH",
        "outputId": "e390920e-8238-4994-9a25-aea6bf8f9784"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[-1.44444444 42.77777778 -1. 0.55555556 -0.77777778 9.44444444\n",
            " -0.33333333 0.33333333 -1. 20.55555556]\n"
          ]
        }
      ],
      "source": [
        "import numpy as np\n",
        "\n",
        "# Sample data with outliers\n",
        "data = np.array([1, 200, 3, 10, 4, 50, 6, 9, 3, 100])\n",
        "\n",
        "# One-liner: Robust scaling using MAD\n",
        "# Note: the denominator is the MAD; if it is 0 this becomes a division by zero.\n",
        "scaled = (data - np.median(data)) / np.median(np.abs(data - np.median(data)))\n",
        "print(scaled)\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 2. Equal-frequency (quartile) binning"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Sample continuous data (e.g., customer ages)\n",
        "ages = np.array([18, 25, 35, 22, 45, 67, 23, 29, 34, 56, 41, 38, 52, 28, 33])\n",
        "\n",
        "# One-liner: Create 4 equal-frequency bins, labelled 0-3\n",
        "# np.digitize already returns 0 for values below the first edge, so no offset is\n",
        "# needed (subtracting 1 would produce a -1 label for the lowest bin).\n",
        "binned = np.digitize(ages, np.percentile(ages, [25, 50, 75]))\n",
        "print(binned)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "MZ654h7ec34I",
        "outputId": "144564d7-b48c-4bf2-c034-b131fbaeeefa"
      },
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[0 0 2 0 3 3 0 1 2 3 2 2 3 1 1]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 3. Degree-2 polynomial features"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Original features (e.g., temperature, humidity)\n",
        "X = np.array([[20, 65], [25, 70], [30, 45], [22, 80]])\n",
        "\n",
        "# One-liner: Generate degree-2 polynomial features\n",
        "# Column order follows the (i, j) loop with j >= i: x0*x0, x0*x1, x1*x1.\n",
        "poly_features = np.column_stack([X[:, [i, j]].prod(axis=1) for i in range(X.shape[1]) for j in range(i, X.shape[1])])\n",
        "print(poly_features)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "kzwwApnQc9pV",
        "outputId": "11249482-96eb-47cc-db05-73bf1b620d27"
      },
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[[ 400 1300 4225]\n",
            " [ 625 1750 4900]\n",
            " [ 900 1350 2025]\n",
            " [ 484 1760 6400]]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 4. Lag features for a time series"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Time series data (e.g., daily sales)\n",
        "sales = np.array([100, 98, 120,130, 74, 145, 110, 140, 65, 105, 135])\n",
        "\n",
        "# One-liner: Create lag-1, lag-2 and lag-3 features (one column per lag)\n",
        "# np.roll wraps values from the end of the array around to the front, so the\n",
        "# first 3 rows (3 = the largest shift) are dropped with [3:].\n",
        "lags = np.column_stack([np.roll(sales, shift) for shift in [1, 2, 3]])[3:]\n",
        "print(lags)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "5XTtCYpQdfML",
        "outputId": "9bbe8c29-c1e2-4cbb-be7d-00b77bfe89b7"
      },
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[[120 98 100]\n",
            " [130 120 98]\n",
            " [ 74 130 120]\n",
            " [145 74 130]\n",
            " [110 145 74]\n",
            " [140 110 145]\n",
            " [ 65 140 110]\n",
            " [105 65 140]]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 5. One-hot encoding"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Categorical data (e.g., product categories)\n",
        "categories = np.array([0, 1, 2, 1, 0, 2, 3, 1])\n",
        "\n",
        "# One-liner: One-hot encode\n",
        "# Each code is compared against 0..max, so the codes must be non-negative integers.\n",
        "one_hot = (categories[:, None] == np.arange(categories.max() + 1)).astype(int)\n",
        "print(one_hot)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "R0HPVkzmdgvW",
        "outputId": "a7a1e5b3-9dac-46c3-90bb-14915619a3dd"
      },
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[[1 0 0 0]\n",
            " [0 1 0 0]\n",
            " [0 0 1 0]\n",
            " [0 1 0 0]\n",
            " [1 0 0 0]\n",
            " [0 0 1 0]\n",
            " [0 0 0 1]\n",
            " [0 1 0 0]]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 6. Distance from a reference point"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Coordinate data\n",
        "locations = np.array([[40.7128, -74.0060],\n",
        "                      [34.0522, -118.2437],\n",
        "                      [41.8781, -87.6298],\n",
        "                      [29.7604, -95.3698]])\n",
        "reference = np.array([39.7392, -104.9903])\n",
        "\n",
        "# One-liner: Calculate Euclidean distances from reference point\n",
        "# The result is in the same units as the input coordinates.\n",
        "distances = np.sqrt(((locations - reference) ** 2).sum(axis=1))\n",
        "print(distances)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "w0rszMzidyjs",
        "outputId": "8eb2ff56-f6a1-4b4a-b383-b59770133c49"
      },
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[30.99959263 14.42201722 17.4917653 13.86111358]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 7. Pairwise interaction features"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Sample features (e.g., price, quality, brand_score)\n",
        "features = np.array([[10, 8, 7], [15, 9, 6], [12, 7, 8], [20, 10, 9]])\n",
        "\n",
        "# One-liner: Create all pairwise interactions\n",
        "# Column order follows the (i, j) loop with j > i: f0*f1, f0*f2, f1*f2.\n",
        "interactions = np.array([features[:, i] * features[:, j]\n",
        "                         for i in range(features.shape[1])\n",
        "                         for j in range(i+1, features.shape[1])]).T\n",
        "print(interactions)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0o8FYjTZd7ey",
        "outputId": "cd8ce3c1-309b-4c75-d0b6-ac7f602d02fb"
      },
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[[ 80 70 56]\n",
            " [135 90 54]\n",
            " [ 84 96 56]\n",
            " [200 180 90]]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 8. Rolling mean"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Noisy signal data (e.g., stock prices, sensor readings)\n",
        "signal = np.array([10, 27, 12, 18, 11, 19, 20, 26, 12, 19, 25, 31, 28])\n",
        "window_size = 4\n",
        "\n",
        "# One-liner: Create rolling mean features\n",
        "# mode='valid' keeps only full windows, giving len(signal) - window_size + 1 values.\n",
        "rolling_mean = np.convolve(signal, np.ones(window_size)/window_size, mode='valid')\n",
        "print(rolling_mean)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "amZgqlEZeAOI",
        "outputId": "fa48d13f-f98b-43b9-8dd0-a1a66c9c7557"
      },
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[16.75 17. 15. 17. 19. 19.25 19.25 20.5 21.75 25.75]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 9. Percentile-based outlier flags"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Data with potential outliers (e.g., transaction amounts)\n",
        "amounts = np.array([25, 30, 28, 32, 500, 29, 31, 27, 33, 26])\n",
        "\n",
        "# One-liner: Create outlier indicator features\n",
        "# Note: the cut-offs are percentiles of the sample itself, so the extremes at\n",
        "# both ends are flagged (here the smallest value, 25, as well as 500).\n",
        "outlier_flags = ((amounts < np.percentile(amounts, 5)) |\n",
        "                 (amounts > np.percentile(amounts, 95))).astype(int)\n",
        "print(outlier_flags)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "OiHHeWgreq21",
        "outputId": "80db2904-677c-4eee-f6b1-2dedadabf2b5"
      },
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[1 0 0 0 1 0 0 0 0 0]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 10. Frequency encoding"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Categorical data (e.g., product categories)\n",
        "categories = np.array(['Electronics', 'Books', 'Electronics', 'Clothing',\n",
        "                       'Books', 'Electronics', 'Home', 'Books'])\n",
        "\n",
        "# One-liner: Frequency encode\n",
        "# inverse[i] is the position of categories[i] within unique_cats, so\n",
        "# counts[inverse] looks up every element's count in one vectorized step.\n",
        "unique_cats, inverse, counts = np.unique(categories, return_inverse=True, return_counts=True)\n",
        "freq_encoded = counts[inverse]\n",
        "print(freq_encoded)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Nor6f_t-ezhk",
        "outputId": "1d142d36-f381-4f1a-eff8-162b6b2c5bd1"
      },
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[3 3 3 1 3 3 1 3]\n"
          ]
        }
      ]
    }
  ]
}