import random from typing import ( List, Dict, Callable, Union, ) import numpy Number = Union[int, float] Array = numpy.array # A sample stored by the counts of its unique values. SampleCounts = Dict[Number, int] # unique_value: count ############################# # Estimators ############################# SampleValues = Array # [float, ...] ValueCounts = Array # [int, ...] Estimate = float Estimator = Callable[[SampleValues, ValueCounts], Estimate] def mean_estimator( values: SampleValues, counts: ValueCounts, ) -> Estimate: sum_values = (counts * values).sum() num_values = counts.sum() return sum_values / num_values ############################# # Estimate Quality Assessors ############################# Estimates = Array # [Estimate, ...] EstimateAssessor = Callable[[Estimates], Estimate] def standard_deviation(estimates: Estimates) -> Estimate: return estimates.std() def percentile_05(estimates: Estimates) -> Estimate: return numpy.percentile(estimates, 5) def percentile_95(estimates: Estimates) -> Estimate: return numpy.percentile(estimates, 95) def percentile_NN(nn) -> EstimateAssessor: assert 0 <= nn <= 100 def p_nn(estimates: Estimates) -> Estimate: return numpy.percentile(estimates, nn) return p_nn ############################# # Algorithm ############################# def bag_little_bootstraps( data: SampleCounts, estimator: Estimator, estimate_assessors: List[EstimateAssessor], s=20, r=50, b_power=0.65, ) -> List[Estimate]: ''' Bag of little bootstraps: https://arxiv.org/pdf/1112.5016.pdf The default args should be sane enough defaults for a variety of situations, but if you need to be precise you can optimize them by doing the relative error analysis in the paper with various values. Params data: Input data you sampled from some population estimator: function that calculates estimate (e.g. mean_estimator) estimate_assessors: functions that assesses estimates (e.g. standard_deviation) s: # subsamples r: # bootstrap resamples per subsample b_power: Determines size of subsample ''' data_values = list(data.keys()) # Unique values data_value_counts = list(data.values()) # Unique value counts data_size = sum(data_value_counts) # Determine b # "b" is the subsample size b = int(data_size ** b_power) uniform_subsample_weights = Array([1.0 / b] * b) # Every assessor gets s subset assessments assessor_estimates = [] # [[est_1, est_2, est_s], ...] for _ in estimate_assessors: assessor_estimates.append(Array([0.0] * s)) # Calculate assessor estimates for every subset for s_j in range(s): # Pick a random subset of data # - The size of the sample is "b" # - Sample w/out replacement sample_subset = Array( random.sample( data_values, counts=data_value_counts, k=b, ) ) # Bootstrap resamples # - Calculate an estimate for every resample b_estimates = Array([0.0] * r) for b_k in range(r): # Create a bootstrap resample # - Bootstrap resample size is the size of the original sample, not # the size of the subset sample. b_counts = numpy.random.multinomial( data_size, pvals=uniform_subsample_weights, ) # Calculate estimate of the resampled data b_estimates[b_k] = estimator(sample_subset, b_counts) # Calculate an estimate quality assessment for each assessor for i, estimate_assessor in enumerate(estimate_assessors): assessor_estimates[i][s_j] = estimate_assessor(b_estimates) # Average assessor assessments for each subset to get final estimates final_assessment_estimates = [] for sample_ests in assessor_estimates: final_assessment_estimates.append(sample_ests.mean()) return final_assessment_estimates def mean_characterization_example(): ''' Use BLB to estimate the percentiles of the sampling distribution of the mean. ''' from collections import Counter # Create a sample Uniform(0, 100) # - Discretize it to 0.1 N = 1_000_000 print(f"\nCreating data, size: {N} ...") data = [round(random.random() * 100, 1) for _ in range(N)] data = Counter(data) # Run BLB # - Can swap out mean_estimator for median, ... # - Can swap out percentile_05 for standard_deviation, ... print("BLB Bootstrapping...") estimator = mean_estimator # estimate_assessors = [percentile_05, percentile_95] estimate_assessors = [percentile_NN(x) for x in range(101)] assessor_stats = bag_little_bootstraps( data, estimator=estimator, estimate_assessors=estimate_assessors, ) print(f" 1st percentile: {assessor_stats[1]}") print(f"99th percentile: {assessor_stats[99]}") if __name__ == "__main__": mean_characterization_example()