Created
May 4, 2025 13:23
-
-
Save peterc/43d86f9ed41d41ec50b3f97e96c60691 to your computer and use it in GitHub Desktop.
Revisions
-
peterc created this gist
May 4, 2025 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,68 @@ require 'zlib' require 'objspace' # A variant of https://en.wikipedia.org/wiki/Count%E2%80%93min_sketch class CompactApproximator attr_accessor :cells def initialize(size, hashes, min_elem, join_fn, meet_fn) @m, @k = size, hashes @cells = Array.new(@k){ Array.new(@m, min_elem) } @join, @meet = join_fn, meet_fn end def add(key, value) @k.times do |i| j = (Zlib.crc32("#{i}-#{key}") % @m) @cells[i][j] = @join.call(@cells[i][j], value) end end def query(key) @k.times .map { |i| @cells[i][Zlib.crc32("#{i}-#{key}") % @m] } .reduce { |a,b| @meet.call(a,b) } end end sample_words = %w[the and to of a in that is it oliver said funeral kwyjibo] # accumulate counts, then take min for query sketch = CompactApproximator.new( 2000, 2, 0, ->(a,b){ a + b }, # join function = addition ->(a,b){ a < b ? a : b } # meet function = min value ) exact = Hash.new(0) # Read the file of Oliver Twist - grab this from wherever # or some other large text file = 'oliver.txt' File.foreach(file) do |line| line.downcase.scan(/\w+/).each do |w| sketch.add(w, 1) exact[w] += 1 end end puts "Word exact est error" puts "---- ----- --- -----" sample_words.each do |w| e = exact[w] q = sketch.query(w) puts "%-7s %6d %6d %6d" % [w, e, q, q - e] end cells_overhead = ObjectSpace.memsize_of(sketch.cells) + sketch.cells.sum { |row| ObjectSpace.memsize_of(row) + row.sum { |elem| ObjectSpace.memsize_of(elem) } } puts "Memory usage:" puts " Exact hash: #{ObjectSpace.memsize_of(exact)} bytes" puts " @cells matrix: #{cells_overhead} bytes"