# NumPy Pro Tips: Data Analysis & EDA Techniques

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ======================================================================
# 1. STATISTICAL FUNCTIONS & AGGREGATION
# ======================================================================

# Basic descriptive statistics
data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
print(f"Mean: {np.mean(data)}")
print(f"Median: {np.median(data)}")
print(f"Standard Deviation: {np.std(data)}")
print(f"Variance: {np.var(data)}")
print(f"Min: {np.min(data)}, Max: {np.max(data)}")

# Percentiles and quantiles
print(f"25th percentile: {np.percentile(data, 25)}")
print(f"50th percentile: {np.percentile(data, 50)}")  # Same as median
print(f"75th percentile: {np.percentile(data, 75)}")
print(f"Interquartile range (IQR): {np.percentile(data, 75) - np.percentile(data, 25)}")

# Multi-dimensional aggregation with axis parameter
array_2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(f"Row means: {np.mean(array_2d, axis=1)}")     # Mean of each row
print(f"Column means: {np.mean(array_2d, axis=0)}")  # Mean of each column

# Weighted statistics
values = np.array([1, 2, 3, 4, 5])
weights = np.array([0.1, 0.2, 0.3, 0.3, 0.1])
weighted_avg = np.average(values, weights=weights)
print(f"Weighted average: {weighted_avg}")

# Running/cumulative statistics
print(f"Cumulative sum: {np.cumsum(data)}")
print(f"Cumulative product: {np.cumprod(data)}")
print(f"Cumulative max: {np.maximum.accumulate(data)}")

# ======================================================================
# 2. VECTORIZED DATA CLEANING & TRANSFORMATION
# ======================================================================

# Handling missing values (NaN)
data_with_nan = np.array([1, 2, np.nan, 4, 5, np.nan, 7])
print(f"Identify NaN values: {np.isnan(data_with_nan)}")
print(f"Count of NaN values: {np.isnan(data_with_nan).sum()}")
print(f"Filtered array (non-NaN): {data_with_nan[~np.isnan(data_with_nan)]}")

# Mean imputation for NaN values
mean_val = np.nanmean(data_with_nan)  # Mean ignoring NaNs
data_imputed = np.where(np.isnan(data_with_nan), mean_val, data_with_nan)
print(f"After mean imputation: {data_imputed}")

# Outlier detection with Z-scores
def detect_outliers_zscore(data, threshold=3):
    """Detect outliers using Z-score method"""
    z_scores = (data - np.mean(data)) / np.std(data)
    return np.abs(z_scores) > threshold

# Outlier detection with IQR method
def detect_outliers_iqr(data, k=1.5):
    """Detect outliers using IQR method"""
    q1, q3 = np.percentile(data, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr
    return (data < lower_bound) | (data > upper_bound)

# Min-max normalization
def min_max_normalize(data):
    """Scale data to range [0, 1]"""
    return (data - np.min(data)) / (np.max(data) - np.min(data))

# Z-score standardization
def standardize(data):
    """Standardize data to mean=0, std=1"""
    return (data - np.mean(data)) / np.std(data)
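
# Example (illustrative): apply the cleaning helpers above to a synthetic
# sample with one injected outlier. The sample values are assumptions made
# for demonstration, not part of the techniques themselves.
rng = np.random.default_rng(42)
sample = np.append(rng.normal(loc=50, scale=5, size=100), 120.0)
print(f"Z-score outliers: {sample[detect_outliers_zscore(sample)]}")
print(f"IQR outliers: {sample[detect_outliers_iqr(sample)]}")
print(f"Min-max normalized range: [{min_max_normalize(sample).min()}, {min_max_normalize(sample).max()}]")
print(f"Standardized std: {standardize(sample).std()}")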

# ======================================================================
# 3. BROADCASTING & VECTORIZATION TRICKS
# ======================================================================

# Element-wise operations are automatically vectorized
a = np.array([1, 2, 3, 4])
b = np.array([5, 6, 7, 8])
print(f"a + b: {a + b}")
print(f"a * b: {a * b}")
print(f"a ** 2: {a ** 2}")
print(f"np.log(a): {np.log(a)}")

# Broadcasting with different shapes
row = np.array([1, 2, 3, 4])
column = np.array([[10], [20], [30]])
print(f"Broadcasting result:\n{column + row}")  # 3x4 result

# Conditional logic without loops
conditions = [data < 3, (data >= 3) & (data < 7), data >= 7]
choices = [data * 2, data, data * 0.5]
result = np.select(conditions, choices, default=data)
print(f"Conditional result: {result}")

# Fast replacement with where
numbers = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
result = np.where(numbers % 2 == 0, "even", "odd")
print(f"Even/odd labels: {result}")

# ======================================================================
# 4. ADVANCED INDEXING & FILTERING
# ======================================================================

# Boolean masking
array = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
mask = array > 5
print(f"Filtered array: {array[mask]}")

# Multiple condition filtering with logical operators
mask2 = (array > 3) & (array < 8)
print(f"Complex filter: {array[mask2]}")

# Fancy indexing with integer arrays
indices = np.array([0, 2, 5, 7])
print(f"Fancy indexing: {array[indices]}")

# Combining boolean masking and fancy indexing
filtered_indices = np.where(array > 5)[0]  # Returns indices where condition is True
print(f"Indices where array > 5: {filtered_indices}")
print(f"Values where array > 5: {array[filtered_indices]}")

# ======================================================================
# 5. EFFICIENT COMPUTATION TECHNIQUES
# ======================================================================

# Element-wise comparison with tolerance
a = np.array([0.1, 0.2, 0.3])
b = np.array([0.10000001, 0.2, 0.30000001])
print(f"Exact equality: {a == b}")
print(f"Equality with tolerance: {np.isclose(a, b)}")
print(f"All close: {np.allclose(a, b)}")

# Optimized linear algebra
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
print(f"Matrix multiplication:\n{np.dot(A, B)}")  # or A @ B in Python 3.5+
print(f"Matrix inverse:\n{np.linalg.inv(A)}")
print(f"Determinant: {np.linalg.det(A)}")
print(f"Eigenvalues: {np.linalg.eigvals(A)}")

# Fast set operations
a = np.array([1, 2, 3, 4, 5])
b = np.array([4, 5, 6, 7, 8])
print(f"Unique values: {np.unique(np.concatenate((a, b)))}")
print(f"Intersection: {np.intersect1d(a, b)}")
print(f"Union: {np.union1d(a, b)}")
print(f"In a but not in b: {np.setdiff1d(a, b)}")
print(f"In either a or b but not both: {np.setxor1d(a, b)}")
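
# Example (illustrative): np.isin complements the set operations above by
# returning an element-wise membership mask, which is handy for filtering.
# It reuses the a and b arrays defined just above.
print(f"Mask of a's elements present in b: {np.isin(a, b)}")
print(f"Elements of a also present in b: {a[np.isin(a, b)]}")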

# ======================================================================
# 6. MEMORY EFFICIENCY & PERFORMANCE TIPS
# ======================================================================

# View vs copy
original = np.arange(10)
view = original[2:5]  # Creates a view - changes affect original
view[0] = 99
print(f"Original after modifying view: {original}")

copy = original[2:5].copy()  # Creates a copy - changes don't affect original
copy[0] = 88
print(f"Original after modifying copy: {original}")

# Memory-efficient dtype selection
int_array = np.arange(1000)
int_small = np.arange(1000, dtype=np.int16)  # Use smaller integer type
print(f"Memory usage int_array: {int_array.nbytes} bytes")
print(f"Memory usage int_small: {int_small.nbytes} bytes")

# Pre-allocate arrays for performance
def efficient_preallocate():
    result = np.zeros(1000)
    for i in range(1000):
        result[i] = i  # Modifies existing array
    return result

# Use ufuncs for faster aggregation
large_array = np.random.rand(1000000)
# Fast:
result = np.sum(large_array)
# Slower:
# result = sum(large_array)

# ======================================================================
# 7. DATA EXPLORATION HELPERS
# ======================================================================

# Generate descriptive statistics report
def describe_numpy_array(arr):
    """Generate descriptive statistics for numpy array"""
    return {
        'shape': arr.shape,
        'size': arr.size,
        'dim': arr.ndim,
        'dtype': arr.dtype,
        'min': np.min(arr),
        'max': np.max(arr),
        'mean': np.mean(arr),
        'median': np.median(arr),
        'std': np.std(arr),
        'var': np.var(arr),
        'Q1': np.percentile(arr, 25),
        'Q3': np.percentile(arr, 75),
        # Median-based skew proxy (Pearson's second coefficient is 3x this),
        # not the moment-based skewness
        'skewness': (np.mean(arr) - np.median(arr)) / np.std(arr) if np.std(arr) != 0 else 0,
        'unique_values': np.unique(arr).size,
        'missing_values': np.isnan(arr).sum() if np.issubdtype(arr.dtype, np.number) else None
    }

# Find patterns in data with FFT
def find_frequencies(time_series):
    """Find dominant frequencies in time series data"""
    fft_output = np.fft.fft(time_series)
    power = np.abs(fft_output)
    freq = np.fft.fftfreq(len(time_series))
    dominant_freq_idx = np.argmax(power[1:]) + 1
    return {
        'dominant_freq': freq[dominant_freq_idx],
        'dominant_period': 1 / freq[dominant_freq_idx] if freq[dominant_freq_idx] != 0 else np.inf,
        'fft_power': power,
        'frequencies': freq
    }
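
# Example (illustrative): exercise the two helpers above on a synthetic
# signal - a pure sine wave with a 20-sample period. The signal is an
# assumption made for demonstration purposes only.
signal = np.sin(2 * np.pi * np.arange(200) / 20)
print(describe_numpy_array(signal))
freq_info = find_frequencies(signal)
print(f"Dominant frequency: {freq_info['dominant_freq']}")  # magnitude should be 1/20 = 0.05 cycles/sample
print(f"Dominant period: {freq_info['dominant_period']}")   # magnitude should be ~20 samples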