Created
          November 24, 2015 22:04 
        
      - 
      
- 
        Save devdazed/69fcd98b6655a35c092b to your computer and use it in GitHub Desktop. 
Revisions
- 
        # Russ Bradberry created this gist Nov 24, 2015.
#!/usr/bin/env python
"""
Counts the number of tombstones in a keyspace.table and reports the top N highest counts

tombstone_count.py
    [-h]                   This help screen
    [--data-dir DATA_DIR]  The C* data directory (/var/lib/cassandra/data)
    [--top-k TOP_K]        The top number of keys with highest tombstone counts to display.
    keyspace               The keyspace that contains the table
    table                  The table to count tombstones
"""
from collections import Counter
import argparse
import glob
import json
import operator
import subprocess


def sizeof_fmt(num, suffix='B'):
    """Format a byte count as a human-readable string using binary prefixes.

    :param num: the quantity to format (int or float)
    :param suffix: unit suffix appended after the prefix (default ``'B'``)
    :return: e.g. ``'1.5KiB'``; values of 1024**8 and above are reported in ``Yi``
    """
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)


class SSTableReader(object):
    """Iterator over the rows that ``sstable2json`` prints for one sstable.

    The ``sstable2json`` subprocess is started lazily on the first call to
    ``next()``.  Its output is consumed one byte at a time and expected to be
    a JSON array with one object per row; each object is decoded and yielded
    on its own so the whole dump is never held in memory.

    ``num_bytes`` holds the number of bytes consumed from the subprocess.
    """

    def __init__(self, sstable):
        """:param sstable: path of the ``*-Data.db`` file to dump"""
        self._sstable = sstable
        self._proc = None
        self._opened = False
        self.num_bytes = 0

    def __iter__(self):
        return self

    def _read_n(self, n=1):
        """Discard up to ``n`` bytes of output, counting only what was really read."""
        self.num_bytes += len(self._proc.stdout.read(n))

    def _next_object(self):
        """Read and decode the next JSON object from the stream.

        :return: the decoded row, or ``None`` once the output is exhausted
        :raises ValueError: if the next non-newline byte does not start an object
        """
        buf = []
        for char in self.read():
            if char == b'\n':
                continue
            if not buf:
                if char == b']':
                    # closing bracket of the top-level array (e.g. an empty
                    # dump): there are no more rows
                    return None
                if char != b'{':
                    raise ValueError(
                        'Invalid JSON Object Start Char: {0} ({1})'.format(chr(ord(char)), ord(char)))
            buf.append(char)
            # the object ends with a `}`, so each one we see we try to marshal
            # if the marshal works, the object is complete
            if char == b'}':
                try:
                    row = json.loads(b''.join(buf).decode('utf-8'))
                except ValueError:
                    # if we can't marshal the object, then continue reading
                    continue
                self._read_n(2)  # skip past the next two chars `,\n`
                return row
        return None

    def open(self):
        """Start ``sstable2json`` on the sstable with its stdout attached to a pipe."""
        self._proc = subprocess.Popen(['sstable2json', self._sstable],
                                      stdout=subprocess.PIPE, bufsize=1048576)

    def close(self):
        """Close the pipe and reap the subprocess; safe to call more than once."""
        if self._proc is None:
            return
        self._proc.stdout.close()
        self._proc.wait()
        self._proc = None

    def read(self):
        """Yield the subprocess output one byte at a time until EOF."""
        if self._proc is None:
            return
        stream = self._proc.stdout
        # b'' is the EOF sentinel on both Python 2 (where bytes is str) and 3
        for c in iter(lambda: stream.read(1), b''):
            # count before yielding so the byte is not lost if the consumer
            # stops iterating right after receiving it
            self.num_bytes += 1
            yield c

    def __next__(self):
        """Return the next row, releasing the subprocess once the rows run out."""
        if not self._opened:
            self.open()
            self._opened = True
            self._read_n(2)  # skip past the first two chars `[\n`
        try:
            next_object = self._next_object()
        except ValueError:
            self.close()
            raise
        if next_object is None:
            self.close()
            raise StopIteration()
        return next_object

    next = __next__  # Python 2 iterator protocol


class TombstoneCounter(object):
    """Counts tombstoned cells per row key across all sstables of one table."""

    def __init__(self, keyspace, table, data_dir):
        """
        :param keyspace: keyspace directory name under ``data_dir``
        :param table: table directory name under the keyspace directory
        :param data_dir: root data directory that is searched for sstables
        """
        self._data_dir = data_dir
        self._keyspace = keyspace
        self._table = table
        self._sstable_count = 0
        self._total_bytes = 0
        self._tombstones = Counter()

    @staticmethod
    def read_sstable_json(sstable):
        """Announce the sstable on stdout and return an ``SSTableReader`` for it."""
        print('Reading {0}'.format(sstable))
        reader = SSTableReader(sstable)
        return reader

    def sstable_files(self):
        """Return the ``*-Data.db`` paths found under data_dir/keyspace/table."""
        tables = glob.glob('{0}/{1}/{2}/*-Data.db'.format(self._data_dir, self._keyspace, self._table))
        self._sstable_count = len(tables)
        print('Found {0} sstables'.format(self._sstable_count))
        return tables

    def count_tombstones(self):
        """Count tombstones in every sstable returned by ``sstable_files``."""
        for sstable in self.sstable_files():
            self.count_tombstones_in_sstable(sstable)

    def count_tombstones_in_row(self, row):
        """Add one to the row key's tally for each cell whose fourth element is ``'t'``.

        :param row: decoded row; ``row['key']`` and ``row['cells']`` are read
        """
        for cell in row['cells']:
            if len(cell) > 3 and cell[3] == 't':
                self._tombstones[row['key']] += 1

    def count_tombstones_in_sstable(self, sstable):
        """Tally every row of one sstable and add the bytes read to the total."""
        reader = self.read_sstable_json(sstable)
        try:
            for row in reader:
                self.count_tombstones_in_row(row)
        finally:
            # make sure the sstable2json process is reaped even if a row fails
            reader.close()
        self._total_bytes += reader.num_bytes

    def report(self, top):
        """Print a summary and the ``top`` keys with the most tombstones.

        Only keys that have at least one tombstone are tracked, so the key
        count in the summary line covers those keys only.

        :param top: number of keys to list (int)
        """
        sorted_tombstones = sorted(self._tombstones.items(), key=operator.itemgetter(1))
        sorted_tombstones.reverse()
        print('Read {0} keys and {1} of data'.format(len(sorted_tombstones), sizeof_fmt(self._total_bytes)))
        print('Top {0} keys with highest number tombstones'.format(top))
        for n, pair in enumerate(sorted_tombstones[0:top], 1):
            print("{0:3} {1} => {2}".format(str(n) + '.', pair[0], pair[1]))


def build_parser():
    """Build the command-line parser; the module docstring is the usage text."""
    parser = argparse.ArgumentParser(usage=__doc__)
    parser.add_argument('keyspace')
    parser.add_argument('table')
    parser.add_argument('--data-dir', default='/var/lib/cassandra/data')
    # type=int: without it a value given on the command line stays a string
    # and the slice in TombstoneCounter.report() raises TypeError
    parser.add_argument('--top-k', type=int, default=25)
    return parser


def main():
    """Parse the command line, count the tombstones and print the report."""
    args = build_parser().parse_args()
    t = TombstoneCounter(args.keyspace, args.table, args.data_dir)
    t.count_tombstones()
    t.report(args.top_k)


if __name__ == '__main__':
    main()
 Russ Bradberry
              created
            
            this gist
            
              Russ Bradberry
              created
            
            this gist