# NumPy Pro Tips: Data Analysis & EDA Techniques

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ======================================================================
# 1. STATISTICAL FUNCTIONS & AGGREGATION
# ======================================================================

# Basic descriptive statistics
data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
print(f"Mean: {np.mean(data)}")
print(f"Median: {np.median(data)}")
print(f"Standard deviation: {np.std(data)}")
print(f"Variance: {np.var(data)}")
print(f"Min: {np.min(data)}, Max: {np.max(data)}")

# Percentiles and quantiles
print(f"25th percentile: {np.percentile(data, 25)}")
print(f"50th percentile: {np.percentile(data, 50)}")  # Same as the median
print(f"75th percentile: {np.percentile(data, 75)}")
print(f"Interquartile range (IQR): {np.percentile(data, 75) - np.percentile(data, 25)}")

# Multi-dimensional aggregation with the axis parameter
array_2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(f"Row means: {np.mean(array_2d, axis=1)}")     # Mean of each row
print(f"Column means: {np.mean(array_2d, axis=0)}")  # Mean of each column

# Weighted statistics
values = np.array([1, 2, 3, 4, 5])
weights = np.array([0.1, 0.2, 0.3, 0.3, 0.1])
weighted_avg = np.average(values, weights=weights)
print(f"Weighted average: {weighted_avg}")

# Running/cumulative statistics
print(f"Cumulative sum: {np.cumsum(data)}")
print(f"Cumulative product: {np.cumprod(data)}")
print(f"Cumulative max: {np.maximum.accumulate(data)}")

# ======================================================================
# 2. VECTORIZED DATA CLEANING & TRANSFORMATION
# ======================================================================

# Handling missing values (NaN)
data_with_nan = np.array([1, 2, np.nan, 4, 5, np.nan, 7])
print(f"Identify NaN values: {np.isnan(data_with_nan)}")
print(f"Count of NaN values: {np.isnan(data_with_nan).sum()}")
print(f"Filtered array (non-NaN): {data_with_nan[~np.isnan(data_with_nan)]}")

# Mean imputation for NaN values
mean_val = np.nanmean(data_with_nan)  # Mean ignoring NaNs
data_imputed = np.where(np.isnan(data_with_nan), mean_val, data_with_nan)
print(f"After mean imputation: {data_imputed}")

# Outlier detection with Z-scores
def detect_outliers_zscore(data, threshold=3):
    """Detect outliers using the Z-score method."""
    z_scores = (data - np.mean(data)) / np.std(data)
    return np.abs(z_scores) > threshold

# Outlier detection with the IQR method
def detect_outliers_iqr(data, k=1.5):
    """Detect outliers using the IQR method."""
    q1, q3 = np.percentile(data, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr
    return (data < lower_bound) | (data > upper_bound)

# Min-max normalization
def min_max_normalize(data):
    """Scale data to the range [0, 1]."""
    return (data - np.min(data)) / (np.max(data) - np.min(data))

# Z-score standardization
def standardize(data):
    """Standardize data to mean=0, std=1."""
    return (data - np.mean(data)) / np.std(data)
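
# A minimal end-to-end sketch combining the cleaning helpers above. The
# sample values (and the NaN/outlier placement) are made up for illustration.
raw = np.array([12.0, 15.0, np.nan, 14.0, 200.0, 16.0, 13.0, np.nan, 18.0])
imputed = np.where(np.isnan(raw), np.nanmedian(raw), raw)  # Median imputation is robust to the 200.0 outlier
keep = ~detect_outliers_iqr(imputed)                       # Mask out IQR outliers
clean = imputed[keep]
print(f"Cleaned: {clean}")
print(f"Scaled to [0, 1]: {min_max_normalize(clean)}")
print(f"Standardized: {standardize(clean)}")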
# ======================================================================
# 3. BROADCASTING & VECTORIZATION TRICKS
# ======================================================================

# Element-wise operations are automatically vectorized
a = np.array([1, 2, 3, 4])
b = np.array([5, 6, 7, 8])
print(f"a + b: {a + b}")
print(f"a * b: {a * b}")
print(f"a ** 2: {a ** 2}")
print(f"np.log(a): {np.log(a)}")

# Broadcasting with different shapes
row = np.array([1, 2, 3, 4])
column = np.array([[10], [20], [30]])
print(f"Broadcasting result:\n{column + row}")  # 3x4 result

# Conditional logic without loops
conditions = [data < 3, (data >= 3) & (data < 7), data >= 7]
choices = [data * 2, data, data * 0.5]
result = np.select(conditions, choices, default=data)
print(f"Conditional result: {result}")

# Fast replacement with where
numbers = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
result = np.where(numbers % 2 == 0, "even", "odd")
print(f"Even/odd labels: {result}")

# ======================================================================
# 4. ADVANCED INDEXING & FILTERING
# ======================================================================

# Boolean masking
array = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
mask = array > 5
print(f"Filtered array: {array[mask]}")

# Multiple-condition filtering with logical operators
mask2 = (array > 3) & (array < 8)
print(f"Complex filter: {array[mask2]}")

# Fancy indexing with integer arrays
indices = np.array([0, 2, 5, 7])
print(f"Fancy indexing: {array[indices]}")

# Combining boolean masking and fancy indexing
filtered_indices = np.where(array > 5)[0]  # Returns indices where the condition is True
print(f"Indices where array > 5: {filtered_indices}")
print(f"Values where array > 5: {array[filtered_indices]}")

# ======================================================================
# 5. EFFICIENT COMPUTATION TECHNIQUES
# ======================================================================

# Element-wise comparison with tolerance
a = np.array([0.1, 0.2, 0.3])
b = np.array([0.10000001, 0.2, 0.30000001])
print(f"Exact equality: {a == b}")
print(f"Equality with tolerance: {np.isclose(a, b)}")
print(f"All close: {np.allclose(a, b)}")

# Optimized linear algebra
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
print(f"Matrix multiplication:\n{np.dot(A, B)}")  # or A @ B in Python 3.5+
print(f"Matrix inverse:\n{np.linalg.inv(A)}")
print(f"Determinant: {np.linalg.det(A)}")
print(f"Eigenvalues: {np.linalg.eigvals(A)}")

# Fast set operations
a = np.array([1, 2, 3, 4, 5])
b = np.array([4, 5, 6, 7, 8])
print(f"Unique values: {np.unique(np.concatenate((a, b)))}")
print(f"Intersection: {np.intersect1d(a, b)}")
print(f"Union: {np.union1d(a, b)}")
print(f"In a but not in b: {np.setdiff1d(a, b)}")
print(f"In either a or b but not both: {np.setxor1d(a, b)}")
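
# Broadcasting also eliminates nested loops for pairwise computations. A short
# sketch (the random points are made up for illustration): inserting size-1
# axes makes shapes (5, 1, 2) and (1, 5, 2) broadcast to (5, 5, 2).
points = np.random.rand(5, 2)                      # 5 points in 2-D
diff = points[:, np.newaxis, :] - points[np.newaxis, :, :]
dist_matrix = np.sqrt(np.sum(diff ** 2, axis=-1))  # (5, 5) Euclidean distance matrix
print(f"Pairwise distances:\n{dist_matrix}")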
# ======================================================================
# 6. MEMORY EFFICIENCY & PERFORMANCE TIPS
# ======================================================================

# View vs. copy
original = np.arange(10)
view = original[2:5]  # Creates a view - changes affect the original
view[0] = 99
print(f"Original after modifying view: {original}")

copy = original[2:5].copy()  # Creates a copy - changes don't affect the original
copy[0] = 88
print(f"Original after modifying copy: {original}")

# Memory-efficient dtype selection
int_array = np.arange(1000)
int_small = np.arange(1000, dtype=np.int16)  # Use a smaller integer type when values fit
print(f"Memory usage int_array: {int_array.nbytes} bytes")
print(f"Memory usage int_small: {int_small.nbytes} bytes")

# Pre-allocate arrays for performance
def efficient_preallocate():
    result = np.zeros(1000)
    for i in range(1000):
        result[i] = i  # Modifies the existing array in place
    return result

# Use ufuncs for faster aggregation
large_array = np.random.rand(1000000)
# Fast:
result = np.sum(large_array)
# Slower:
# result = sum(large_array)

# ======================================================================
# 7. DATA EXPLORATION HELPERS
# ======================================================================

# Generate a descriptive statistics report
def describe_numpy_array(arr):
    """Generate descriptive statistics for a numpy array."""
    std = np.std(arr)
    return {
        'shape': arr.shape,
        'size': arr.size,
        'dim': arr.ndim,
        'dtype': arr.dtype,
        'min': np.min(arr),
        'max': np.max(arr),
        'mean': np.mean(arr),
        'median': np.median(arr),
        'std': std,
        'var': np.var(arr),
        'Q1': np.percentile(arr, 25),
        'Q3': np.percentile(arr, 75),
        # Nonparametric skew (mean - median) / std, not the moment-based skewness
        'skewness': (np.mean(arr) - np.median(arr)) / std if std != 0 else 0,
        'unique_values': np.unique(arr).size,
        'missing_values': np.isnan(arr).sum() if np.issubdtype(arr.dtype, np.number) else None,
    }

# Find periodic patterns in data with the FFT
def find_frequencies(time_series):
    """Find the dominant frequency in time series data."""
    fft_output = np.fft.fft(time_series)
    power = np.abs(fft_output)
    freq = np.fft.fftfreq(len(time_series))
    dominant_freq_idx = np.argmax(power[1:]) + 1  # Skip index 0 (the DC component)
    return {
        'dominant_freq': freq[dominant_freq_idx],
        'dominant_period': 1 / freq[dominant_freq_idx] if freq[dominant_freq_idx] != 0 else np.inf,
        'fft_power': power,
        'frequencies': freq,
    }
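
# A quick usage sketch for the two helpers above, run on a synthetic signal:
# a sine wave at 0.02 cycles/sample plus Gaussian noise (values made up).
t = np.arange(500)
signal = np.sin(2 * np.pi * 0.02 * t) + 0.3 * np.random.randn(500)
report = describe_numpy_array(signal)
print(f"Mean: {report['mean']:.3f}, Std: {report['std']:.3f}, Skew: {report['skewness']:.3f}")
freq_info = find_frequencies(signal)
print(f"Dominant frequency: {freq_info['dominant_freq']:.4f} cycles/sample")
print(f"Dominant period: {freq_info['dominant_period']:.1f} samples")  # ~50 expected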