haldean · October 21, 2015 21:45 · Oct 21, 2015 · Oct 6, 2015
diff --git a/authordensity.py b/authordensity.py
@@ -26,6 +26,7 @@
 
 file_densities = dict()
 authors_found = set()
+total_freq = collections.defaultdict(lambda: 0)
 
 for f in sys.argv[1:]:
     if not match_files.search(f):
@@ -46,6 +47,7 @@
             author = synonyms[author]
         authors_found.add(author)
         authors_freq[author] += 1
+        total_freq[author] += 1
     total_lines = len(author_lines)
     authors = {author: author_lines / total_lines
                for author, author_lines in authors_freq.iteritems()}
@@ -60,3 +62,9 @@
     print("\n".join("%3.0f%%\t%s" % (100. * s[0], s[1])
                     for s in author_densities[:count]))
     print()
+
+total_lines = sum(total_freq.values())
+records = total_freq.items()
+records.sort(key=lambda p: (p[1], p[0]), reverse=True)
+for author, freq in records:
+    print("%s\t%.3f%%\t\t%s" % (freq, 100. * freq / total_lines, author))
diff --git a/authordensity.py b/authordensity.py
@@ -0,0 +1,62 @@
+"""
+Most useful as:
+
+    git ls-tree --name-only -r HEAD | xargs python /path/to/authordensity.py
+    
+Run it from the root of your git repository. If an author shows up under
+multiple names, add entries to the synonyms dict that map each alias to a
+canonical name.
+
+Needs no external libs.
+"""
+
+from __future__ import division, print_function
+
+import collections
+import re
+import subprocess
+import sys
+
+synonyms = {  # alias -> canonical author name; empty by default
+}
+
+match_files = re.compile(r"\.(cc|h|cpp|hpp|c|py|pxi|pyx)$")  # only paths with these extensions are analyzed
+author_line_re = re.compile("^author ")  # selects the "author <name>" lines of the blame output
+count = 30  # max number of files printed per author
+
+file_densities = dict()
+authors_found = set()
+
+for f in sys.argv[1:]:
+    if not match_files.search(f):
+        continue
+    try:
+        blame = subprocess.check_output(
+            ["git", "blame", "--line-porcelain", f], stderr=subprocess.PIPE)
+    except subprocess.CalledProcessError as e:
+        if e.returncode == 128:  # NOTE(review): presumably git's fatal-error status (e.g. untracked path) - confirm
+            continue
+        raise  # any other exit status is propagated to the caller
+    blame_lines = blame.splitlines()
+    author_lines = filter(lambda l: author_line_re.match(l), blame_lines)  # NOTE(review): len() below needs a list, i.e. Python 2 filter()
+    authors_freq = collections.defaultdict(lambda: 0)  # author -> number of blamed lines in this file
+    for line in author_lines:
+        author = line.split(" ", 1)[1]  # text after the "author " prefix
+        if author in synonyms:
+            author = synonyms[author]
+        authors_found.add(author)
+        authors_freq[author] += 1
+    total_lines = len(author_lines)
+    authors = {author: author_lines / total_lines
+               for author, author_lines in authors_freq.iteritems()}
+    file_densities[f] = authors  # path -> {author: fraction of this file's blamed lines}
+
+for author in authors_found:
+    author_densities = [(file_density.get(author, 0), fname)
+                        for fname, file_density in file_densities.iteritems()
+                        if file_density.get(author)]  # skip files where the author has no lines
+    author_densities.sort(reverse=True)  # largest share first
+    print("\n%s" % author)
+    print("\n".join("%3.0f%%\t%s" % (100. * s[0], s[1])
+                    for s in author_densities[:count]))
+    print()