Skip to content

Instantly share code, notes, and snippets.

@devdazed
Created November 24, 2015 22:04
Show Gist options
  • Save devdazed/69fcd98b6655a35c092b to your computer and use it in GitHub Desktop.
Save devdazed/69fcd98b6655a35c092b to your computer and use it in GitHub Desktop.

Revisions

  1. Russ Bradberry created this gist Nov 24, 2015.
    151 changes: 151 additions & 0 deletions tombstone_count.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,151 @@
    #!/usr/bin/env python

    """
    Counts the number of tombstones in a keyspace.table and reports the top N highest counts
    tombstone_count.py
    [-h] This help screen
    [--data-dir DATA_DIR] The C* data directory (/var/lib/cassandra/data)
    [--top-k TOP_K] The top number of keys with highest tombstone counts to display.
    keyspace The keyspace that contains the table
    table The table to count tombstones
    """

    from collections import Counter
    import argparse
    import glob
    import json
    import operator
    import subprocess


    def sizeof_fmt(num, suffix='B'):
        """Format a quantity with binary (1024-based) unit prefixes.

        The value is divided by 1024 until its magnitude drops below 1024 and is
        then rendered with one decimal place, the matching prefix ('' through
        'Zi') and ``suffix``.  Anything still too large after 'Zi' is reported
        in 'Yi'.
        """
        prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
        value = num
        for prefix in prefixes:
            if not abs(value) < 1024.0:
                # Too big for this prefix: scale down and try the next one.
                value = value / 1024.0
                continue
            return "%3.1f%s%s" % (value, prefix, suffix)
        return "%.1f%s%s" % (value, 'Yi', suffix)


    class SSTableReader(object):
        """Iterator over the JSON objects that ``sstable2json`` prints for one SSTable.

        The tool's stdout is consumed one character at a time; every complete
        top-level JSON object is decoded with ``json.loads`` and returned from
        ``next()``.  ``num_bytes`` holds the number of characters consumed from
        the stream so far.

        The subprocess is started lazily on the first ``next()`` call and is
        reaped (pipe closed, process waited on) as soon as the stream is
        exhausted or a malformed object start is hit.
        """

        def __init__(self, sstable):
            """Remember the SSTable path; nothing is executed until iteration starts."""
            self._sstable = sstable
            self._proc = None
            self._opened = False
            self.num_bytes = 0

        def __iter__(self):
            return self

        def _read_n(self, n=1):
            """Discard up to ``n`` characters from the stream and account for them."""
            skipped = self._proc.stdout.read(n)
            # Count what was really consumed; near EOF fewer than n may be left.
            self.num_bytes += len(skipped)

        def _next_object(self):
            """Return the next decoded JSON object, or None when the stream ends.

            Raises:
                ValueError: if the first non-newline character of an object is
                    neither ``{`` nor the closing ``]`` of the array.
            """
            buf = []
            for char in self.read():
                if char == '\n':
                    continue

                if not buf:
                    if char == ']':
                        # Closing bracket of the outer array where an object
                        # would start (e.g. an empty array): nothing more to read.
                        return None
                    if char != '{':
                        raise ValueError('Invalid JSON Object Start Char: {0} ({1})'.format(char, ord(char)))

                buf.append(char)

                # the object ends with a `}`, so each one we see we try to marshal
                # if the marshal works, the object is complete
                if char == '}':
                    try:
                        row = json.loads(''.join(buf))
                    except ValueError:
                        # if we can't marshal the object, then continue reading
                        continue
                    self._read_n(2)  # skip past the next two chars `,\n`
                    return row
            return None

        def open(self):
            """Start ``sstable2json`` on the SSTable with its stdout piped to us."""
            # universal_newlines=True makes the pipe text-mode, so read() returns
            # str rather than bytes on Python 3 and the character comparisons in
            # _next_object() behave the same on both major versions.
            self._proc = subprocess.Popen(
                ['sstable2json', self._sstable],
                stdout=subprocess.PIPE,
                bufsize=1048576,
                universal_newlines=True,
            )

        def close(self):
            """Close the pipe and reap the subprocess; safe to call repeatedly."""
            if self._proc is None:
                return
            self._proc.stdout.close()
            self._proc.wait()
            self._proc = None

        def read(self):
            """Yield the subprocess output one character at a time until EOF."""
            stdout = self._proc.stdout
            while True:
                char = stdout.read(1)
                if not char:
                    return
                # Count before yielding: _next_object() abandons this generator
                # as soon as an object is complete, so code placed after the
                # yield would never run for the last character of each object.
                self.num_bytes += 1
                yield char

        def next(self):
            """Return the next row, raising StopIteration once the stream is done."""
            if not self._opened:
                self.open()
                self._opened = True
                self._read_n(2)  # skip past the first two chars `[\n`
            elif self._proc is None:
                # Already exhausted and closed: keep signalling the end.
                raise StopIteration()

            try:
                next_object = self._next_object()
            except ValueError:
                self.close()
                raise

            if next_object is None:
                self.close()
                raise StopIteration()

            return next_object

        # Python 3 spells the iterator protocol method __next__.
        __next__ = next


    class TombstoneCounter(object):
        """Tally cells flagged ``'t'`` per row key across the SSTables of one table.

        SSTables are located with the glob
        ``<data_dir>/<keyspace>/<table>/*-Data.db`` and each one is streamed
        through ``SSTableReader``.
        """

        def __init__(self, keyspace, table, data_dir):
            self._data_dir = data_dir
            self._keyspace = keyspace
            self._table = table
            self._sstable_count = 0
            self._total_bytes = 0
            self._tombstones = Counter()  # row key -> number of flagged cells

        @staticmethod
        def read_sstable_json(sstable):
            """Announce the SSTable being read and return a reader for it."""
            print('Reading {0}'.format(sstable))
            return SSTableReader(sstable)

        def sstable_files(self):
            """Return the ``*-Data.db`` paths of the table and record how many there are."""
            pattern = '{0}/{1}/{2}/*-Data.db'.format(self._data_dir, self._keyspace, self._table)
            tables = glob.glob(pattern)
            self._sstable_count = len(tables)
            print('Found {0} sstables'.format(self._sstable_count))
            return tables

        def count_tombstones(self):
            """Count tombstones in every SSTable found for the table."""
            for sstable in self.sstable_files():
                self.count_tombstones_in_sstable(sstable)

        def count_tombstones_in_row(self, row):
            """Add the number of ``'t'``-flagged cells in ``row`` to its key's tally.

            ``row`` must provide a ``'cells'`` sequence; ``'key'`` is only read
            when at least one flagged cell is present.
            """
            # NOTE(review): only the 't' marker in the fourth cell field is
            # counted; confirm whether other deletion markers emitted by
            # sstable2json should be included as well.
            flagged = sum(1 for cell in row['cells'] if len(cell) > 3 and cell[3] == 't')
            if flagged:
                self._tombstones[row['key']] += flagged

        def count_tombstones_in_sstable(self, sstable):
            """Stream one SSTable, tally its rows and accumulate the amount read."""
            reader = self.read_sstable_json(sstable)
            for row in reader:
                self.count_tombstones_in_row(row)

            self._total_bytes += reader.num_bytes

        def report(self, top):
            """Print the ``top`` keys with the highest tombstone counts.

            ``top`` may be an int or a numeric string (as argparse delivers it
            when no ``type`` is configured); it is coerced to int so it can be
            used as a limit.
            """
            top = int(top)
            print('Read {0} keys and {1} of data'.format(len(self._tombstones), sizeof_fmt(self._total_bytes)))

            print('Top {0} keys with highest number tombstones'.format(top))
            for rank, (key, count) in enumerate(self._tombstones.most_common(top), 1):
                print("{0:3} {1} => {2}".format(str(rank) + '.', key, count))


    def main(argv=None):
        """Parse the command line, count tombstones and print the report.

        Args:
            argv: optional list of argument strings; when None, argparse falls
                back to ``sys.argv[1:]``.
        """
        parser = argparse.ArgumentParser(usage=__doc__)
        parser.add_argument('keyspace')
        parser.add_argument('table')
        parser.add_argument('--data-dir', default='/var/lib/cassandra/data')
        # type=int: without it a user-supplied value stays a string and cannot
        # be used as the limit when the report selects the top entries.
        parser.add_argument('--top-k', type=int, default=25)

        args = parser.parse_args(argv)

        t = TombstoneCounter(args.keyspace, args.table, args.data_dir)
        t.count_tombstones()
        t.report(args.top_k)

    # Run the command-line entry point only when executed as a script,
    # not when the module is imported.
    if '__main__' == __name__:
        main()