# NumPy Pro Tips: Data Analysis & EDA Techniques

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ======================================================================
# 1. STATISTICAL FUNCTIONS & AGGREGATION
# ======================================================================

# Basic descriptive statistics
data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
print(f"Mean: {np.mean(data)}")
print(f"Median: {np.median(data)}")
print(f"Standard Deviation: {np.std(data)}")
print(f"Variance: {np.var(data)}")
print(f"Min: {np.min(data)}, Max: {np.max(data)}")

# Percentiles and quantiles
print(f"25th percentile: {np.percentile(data, 25)}")
print(f"50th percentile: {np.percentile(data, 50)}")  # Same as median
print(f"75th percentile: {np.percentile(data, 75)}")
print(f"Interquartile range (IQR): {np.percentile(data, 75) - np.percentile(data, 25)}")

# Multi-dimensional aggregation with axis parameter
array_2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(f"Row means: {np.mean(array_2d, axis=1)}")     # Mean of each row
print(f"Column means: {np.mean(array_2d, axis=0)}")  # Mean of each column

# Weighted statistics
values = np.array([1, 2, 3, 4, 5])
weights = np.array([0.1, 0.2, 0.3, 0.3, 0.1])
weighted_avg = np.average(values, weights=weights)
print(f"Weighted average: {weighted_avg}")

# Running/cumulative statistics
print(f"Cumulative sum: {np.cumsum(data)}")
print(f"Cumulative product: {np.cumprod(data)}")
print(f"Cumulative max: {np.maximum.accumulate(data)}")

# ======================================================================
# 2. VECTORIZED DATA CLEANING & TRANSFORMATION
# ======================================================================

# Handling missing values (NaN)
data_with_nan = np.array([1, 2, np.nan, 4, 5, np.nan, 7])
print(f"Identify NaN values: {np.isnan(data_with_nan)}")
print(f"Count of NaN values: {np.isnan(data_with_nan).sum()}")
print(f"Filtered array (non-NaN): {data_with_nan[~np.isnan(data_with_nan)]}")

# Mean imputation for NaN values
mean_val = np.nanmean(data_with_nan)  # Mean ignoring NaNs
data_imputed = np.where(np.isnan(data_with_nan), mean_val, data_with_nan)
print(f"After mean imputation: {data_imputed}")

# Outlier detection with Z-scores
def detect_outliers_zscore(data, threshold=3):
    """Detect outliers using Z-score method"""
    z_scores = (data - np.mean(data)) / np.std(data)
    return np.abs(z_scores) > threshold

# Outlier detection with IQR method
def detect_outliers_iqr(data, k=1.5):
    """Detect outliers using IQR method"""
    q1, q3 = np.percentile(data, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr
    return (data < lower_bound) | (data > upper_bound)

# Min-max normalization
def min_max_normalize(data):
    """Scale data to range [0, 1]"""
    return (data - np.min(data)) / (np.max(data) - np.min(data))

# Z-score standardization
def standardize(data):
    """Standardize data to mean=0, std=1"""
    return (data - np.mean(data)) / np.std(data)
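
# Example (illustrative): apply the cleaning helpers above to a synthetic
# sample with one injected outlier. The sample values are assumptions made
# for demonstration, not part of the techniques themselves.
rng = np.random.default_rng(42)
sample = np.append(rng.normal(loc=50, scale=5, size=100), 120.0)
print(f"Z-score outliers: {sample[detect_outliers_zscore(sample)]}")
print(f"IQR outliers: {sample[detect_outliers_iqr(sample)]}")
print(f"Min-max normalized range: [{min_max_normalize(sample).min()}, {min_max_normalize(sample).max()}]")
print(f"Standardized std: {standardize(sample).std()}")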

# ======================================================================
# 3. BROADCASTING & VECTORIZATION TRICKS
# ======================================================================

# Element-wise operations are automatically vectorized
a = np.array([1, 2, 3, 4])
b = np.array([5, 6, 7, 8])
print(f"a + b: {a + b}")
print(f"a * b: {a * b}")
print(f"a ** 2: {a ** 2}")
print(f"np.log(a): {np.log(a)}")

# Broadcasting with different shapes
row = np.array([1, 2, 3, 4])
column = np.array([[10], [20], [30]])
print(f"Broadcasting result:\n{column + row}")  # 3x4 result

# Conditional logic without loops
conditions = [data < 3, (data >= 3) & (data < 7), data >= 7]
choices = [data * 2, data, data * 0.5]
result = np.select(conditions, choices, default=data)
print(f"Conditional result: {result}")

# Fast replacement with where
numbers = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
result = np.where(numbers % 2 == 0, "even", "odd")
print(f"Even/odd labels: {result}")

# ======================================================================
# 4. ADVANCED INDEXING & FILTERING
# ======================================================================

# Boolean masking
array = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
mask = array > 5
print(f"Filtered array: {array[mask]}")

# Multiple condition filtering with logical operators
mask2 = (array > 3) & (array < 8)
print(f"Complex filter: {array[mask2]}")

# Fancy indexing with integer arrays
indices = np.array([0, 2, 5, 7])
print(f"Fancy indexing: {array[indices]}")

# Combining boolean masking and fancy indexing
filtered_indices = np.where(array > 5)[0]  # Returns indices where condition is True
print(f"Indices where array > 5: {filtered_indices}")
print(f"Values where array > 5: {array[filtered_indices]}")

# ======================================================================
# 5. EFFICIENT COMPUTATION TECHNIQUES
# ======================================================================

# Element-wise comparison with tolerance
a = np.array([0.1, 0.2, 0.3])
b = np.array([0.10000001, 0.2, 0.30000001])
print(f"Exact equality: {a == b}")
print(f"Equality with tolerance: {np.isclose(a, b)}")
print(f"All close: {np.allclose(a, b)}")

# Optimized linear algebra
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
print(f"Matrix multiplication:\n{np.dot(A, B)}")  # or A @ B in Python 3.5+
print(f"Matrix inverse:\n{np.linalg.inv(A)}")
print(f"Determinant: {np.linalg.det(A)}")
print(f"Eigenvalues: {np.linalg.eigvals(A)}")

# Fast set operations
a = np.array([1, 2, 3, 4, 5])
b = np.array([4, 5, 6, 7, 8])
print(f"Unique values: {np.unique(np.concatenate((a, b)))}")
print(f"Intersection: {np.intersect1d(a, b)}")
print(f"Union: {np.union1d(a, b)}")
print(f"In a but not in b: {np.setdiff1d(a, b)}")
print(f"In either a or b but not both: {np.setxor1d(a, b)}")
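
# Example (illustrative): np.isin complements the set operations above by
# returning an element-wise membership mask, which is handy for filtering.
# It reuses the a and b arrays defined just above.
print(f"Mask of a's elements present in b: {np.isin(a, b)}")
print(f"Elements of a also present in b: {a[np.isin(a, b)]}")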

# ======================================================================
# 6. MEMORY EFFICIENCY & PERFORMANCE TIPS
# ======================================================================

# View vs copy
original = np.arange(10)
view = original[2:5]  # Creates a view - changes affect original
view[0] = 99
print(f"Original after modifying view: {original}")

copy = original[2:5].copy()  # Creates a copy - changes don't affect original
copy[0] = 88
print(f"Original after modifying copy: {original}")

# Memory-efficient dtype selection
int_array = np.arange(1000)
int_small = np.arange(1000, dtype=np.int16)  # Use smaller integer type
print(f"Memory usage int_array: {int_array.nbytes} bytes")
print(f"Memory usage int_small: {int_small.nbytes} bytes")

# Pre-allocate arrays for performance
def efficient_preallocate():
    result = np.zeros(1000)
    for i in range(1000):
        result[i] = i  # Modifies existing array
    return result

# Use ufuncs for faster aggregation
large_array = np.random.rand(1000000)
# Fast:
result = np.sum(large_array)
# Slower:
# result = sum(large_array)

# ======================================================================
# 7. DATA EXPLORATION HELPERS
# ======================================================================

# Generate descriptive statistics report
def describe_numpy_array(arr):
    """Generate descriptive statistics for numpy array"""
    return {
        'shape': arr.shape,
        'size': arr.size,
        'dim': arr.ndim,
        'dtype': arr.dtype,
        'min': np.min(arr),
        'max': np.max(arr),
        'mean': np.mean(arr),
        'median': np.median(arr),
        'std': np.std(arr),
        'var': np.var(arr),
        'Q1': np.percentile(arr, 25),
        'Q3': np.percentile(arr, 75),
        # Median-based skew proxy (Pearson's second coefficient is 3x this),
        # not the moment-based skewness
        'skewness': (np.mean(arr) - np.median(arr)) / np.std(arr) if np.std(arr) != 0 else 0,
        'unique_values': np.unique(arr).size,
        'missing_values': np.isnan(arr).sum() if np.issubdtype(arr.dtype, np.number) else None
    }

# Find patterns in data with FFT
def find_frequencies(time_series):
    """Find dominant frequencies in time series data"""
    fft_output = np.fft.fft(time_series)
    power = np.abs(fft_output)
    freq = np.fft.fftfreq(len(time_series))
    dominant_freq_idx = np.argmax(power[1:]) + 1
    return {
        'dominant_freq': freq[dominant_freq_idx],
        'dominant_period': 1 / freq[dominant_freq_idx] if freq[dominant_freq_idx] != 0 else np.inf,
        'fft_power': power,
        'frequencies': freq
    }
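
# Example (illustrative): exercise the two helpers above on a synthetic
# signal - a pure sine wave with a 20-sample period. The signal is an
# assumption made for demonstration purposes only.
signal = np.sin(2 * np.pi * np.arange(200) / 20)
print(describe_numpy_array(signal))
freq_info = find_frequencies(signal)
print(f"Dominant frequency: {freq_info['dominant_freq']}")  # magnitude should be 1/20 = 0.05 cycles/sample
print(f"Dominant period: {freq_info['dominant_period']}")   # magnitude should be ~20 samples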