#include // needed by vacuum.h #include #include #include #include #include #include #include #include #include #include #include #include "measuring.h" #include #include #include #include // All the filters expect uint64 as default input, so we don't bother // generating and hashing strings for this benchmark. std::vector generate_unique_input(size_t N) { static thread_local std::mt19937* rng; if (!rng) { size_t seed = 43; rng = new std::mt19937(seed); } static std::uniform_int_distribution dist; std::unordered_set seen; std::vector result(N); for (int i = 0; i < N; ++i) { uint64_t x; do { x = dist(*rng); result[i] = x; } while (seen.count(x)); seen.insert(x); } return result; } struct XorFacade; // The third argument gives the proportion of filter elements in the input, // i.e. step == 3 => every third element matches => 33% true positive rate template void benchmark_filter(const std::vector& true_input, const std::vector& false_input, double hit_percentage) { size_t S = Filter::ARGUMENT_442_MIB; std::atomic lookups{0}; std::atomic true_positives{0}; std::atomic false_positives{0}; std::atomic done{false}; Filter filter(S); if constexpr (std::is_same_v) { filter.insert_many(true_input); } else { for (auto x : true_input) { filter.insert(x); } } std::thread measurement( tenzir::benchmark::measure_for_n_seconds, 5, &done, std::map*>{ {"lookups", &lookups}, {"false_positives", &false_positives}, {"true_positives", &true_positives}}, [](const std::map& old_values, const std::map& new_values) { if (new_values.empty()) return; auto lookups_delta = new_values.at("lookups") - old_values.at("lookups"); auto tp_delta = new_values.at("true_positives") - old_values.at("true_positives"); auto fp_delta = new_values.at("false_positives") - old_values.at("false_positives"); // We don't include true positives in the error rate calculation, because // they represent unavoidable work. double error_rate = 1. * fp_delta / (lookups_delta - tp_delta); std::cout << "epsilon: " << error_rate << "\n" << "false positives: " << fp_delta << "\n" << "lookups: " << lookups_delta << "\n"; }); size_t true_proportion = 100. * hit_percentage; size_t false_proportion = 100 - true_proportion; size_t false_negatives = 0; size_t true_idx = 0, false_idx = 0; while (!done.load(std::memory_order_relaxed)) { for (int i=0; i 0) { std::cerr << "FALSE NEGATIVES DETECTED: " << false_negatives << "\n"; } measurement.join(); } // All three filters have effectively the same interface, except that the // functions for insertion/lookup have different names. To avoid using macros, // we define these facade classes and hope the compiler will optimize them away. using MortonFilter = CompressedCuckoo::Morton3_8; struct MortonFacade : private MortonFilter { constexpr static size_t ARGUMENT_442_MIB = 33*10'000'000; MortonFacade(size_t S) : MortonFilter(S) { } using MortonFilter::insert; bool lookup(uint64_t x) { return MortonFilter::likely_contains(x); } }; using MortonFilter16 = CompressedCuckoo::Morton7_16; struct Morton716Facade : private MortonFilter16 { constexpr static size_t ARGUMENT_442_MIB = 20*10'000'000; Morton716Facade(size_t S) : MortonFilter16(S) { } using MortonFilter16::insert; bool lookup(uint64_t x) { return MortonFilter16::likely_contains(x); } }; using CuckooFilter = cuckoofilter::CuckooFilter; struct CuckooFacade : private CuckooFilter { constexpr static size_t ARGUMENT_442_MIB = 25*10'000'000; CuckooFacade(size_t S) : CuckooFilter(S){}; void insert(uint64_t x) { CuckooFilter::Add(x); } bool lookup(uint64_t x) { return CuckooFilter::Contain(x) == cuckoofilter::Ok; } }; using CuckooFilter16 = cuckoofilter::CuckooFilter; struct CuckooFacade16 : private CuckooFilter16 { constexpr static size_t ARGUMENT_442_MIB = 25*10'000'000; CuckooFacade16(size_t S) : CuckooFilter16(S){}; void insert(uint64_t x) { if (CuckooFilter16::Add(x) != cuckoofilter::Ok) std::cerr << "ERROR INSERTING\n"; } bool lookup(uint64_t x) { return CuckooFilter16::Contain(x) == cuckoofilter::Ok; } }; struct XorFacade { constexpr static size_t ARGUMENT_442_MIB = 21*10'000'000; XorFacade(size_t S) { xor16_allocate(S, &xf); }; ~XorFacade() { xor16_free(&xf); } // Single-item insert is *really* slow for xor filters void insert_many(const std::vector& x) { xor16_populate(x.data(), x.size(), &xf); } bool lookup(uint64_t x) { return xor16_contain(x, &xf); } xor16_t xf; }; using VacuumFilter_ = VacuumFilter; struct VacuumFacade : private VacuumFilter_ { constexpr static size_t ARGUMENT_442_MIB = 24*10'000'000; VacuumFacade(size_t S) { VacuumFilter_::init(S, 4, 400); }; // max_item_numbers, slots per bucket, max_kick_steps using VacuumFilter_::insert; using VacuumFilter_::lookup; }; using BloomFilter = vast::bloom_filter; struct BloomFacade : private BloomFilter { constexpr static size_t ARGUMENT_442_MIB = 8*46*10'000'000ull; BloomFacade(size_t S): BloomFilter(S) {} using BloomFilter::lookup; void insert(uint64_t x) { BloomFilter::add(x); } }; int main() { for (int i=0; i<6; ++i) { int N = 500'000 + i*10'000'000; std::cout << "# Generating input... " << std::endl; auto true_input = generate_unique_input(N); auto false_input = generate_unique_input(4096ull*4096ull); double hit_percentage = 0.1; std::cout << "# Measuring Vacuum Filter\n"; benchmark_filter(true_input, false_input, hit_percentage); std::cout << "# Measuring Cuckoo Filter\n"; benchmark_filter(true_input, false_input, hit_percentage); std::cout << "# Measuring Morton Filter\n"; benchmark_filter(true_input, false_input, hit_percentage); std::cout << "# Measuring Xor Filter\n"; benchmark_filter(true_input, false_input, hit_percentage); std::cout << "# Measuring Bloom Filter\n"; benchmark_filter(true_input, false_input, hit_percentage); std::cout << "# Measuring Cuckoo16 Filter\n"; benchmark_filter(true_input, false_input, hit_percentage); std::cout << "# Measuring Morton16 Filter\n"; benchmark_filter(true_input, false_input, hit_percentage); } }