Skip to content

Instantly share code, notes, and snippets.

@peterc
Created May 4, 2025 13:23
Show Gist options
  • Save peterc/43d86f9ed41d41ec50b3f97e96c60691 to your computer and use it in GitHub Desktop.
Save peterc/43d86f9ed41d41ec50b3f97e96c60691 to your computer and use it in GitHub Desktop.

Revisions

  1. peterc created this gist May 4, 2025.
    68 changes: 68 additions & 0 deletions count_min.rb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,68 @@
    require 'zlib'
    require 'objspace'

    # A variant of https://en.wikipedia.org/wiki/Count%E2%80%93min_sketch

    class CompactApproximator
    attr_accessor :cells

    def initialize(size, hashes, min_elem, join_fn, meet_fn)
    @m, @k = size, hashes
    @cells = Array.new(@k){ Array.new(@m, min_elem) }
    @join, @meet = join_fn, meet_fn
    end

    def add(key, value)
    @k.times do |i|
    j = (Zlib.crc32("#{i}-#{key}") % @m)
    @cells[i][j] = @join.call(@cells[i][j], value)
    end
    end

    def query(key)
    @k.times
    .map { |i| @cells[i][Zlib.crc32("#{i}-#{key}") % @m] }
    .reduce { |a,b| @meet.call(a,b) }
    end
    end

    sample_words = %w[the and to of a in that is it oliver said funeral kwyjibo]

    # accumulate counts, then take min for query
    sketch = CompactApproximator.new(
    2000, 2,
    0,
    ->(a,b){ a + b }, # join function = addition
    ->(a,b){ a < b ? a : b } # meet function = min value
    )

    exact = Hash.new(0)

    # Read the file of Oliver Twist - grab this from wherever
    # or some other large text
    file = 'oliver.txt'

    File.foreach(file) do |line|
    line.downcase.scan(/\w+/).each do |w|
    sketch.add(w, 1)
    exact[w] += 1
    end
    end

    puts "Word exact est error"
    puts "---- ----- --- -----"
    sample_words.each do |w|
    e = exact[w]
    q = sketch.query(w)
    puts "%-7s %6d %6d %6d" % [w, e, q, q - e]
    end

    cells_overhead = ObjectSpace.memsize_of(sketch.cells) +
    sketch.cells.sum { |row|
    ObjectSpace.memsize_of(row) +
    row.sum { |elem| ObjectSpace.memsize_of(elem) }
    }

    puts "Memory usage:"
    puts " Exact hash: #{ObjectSpace.memsize_of(exact)} bytes"
    puts " @cells matrix: #{cells_overhead} bytes"