"""Rank crates by graph centrality.

Reads three tab-separated table dumps (crates, versions, dependencies),
builds an undirected graph with one node per crate name and one edge per
dependency row, and writes degree, pagerank, betweenness and closeness
rankings to CSV files in the current directory.

NOTE(review): the ``\\.`` end-of-data marker and the column names suggest
the inputs are PostgreSQL COPY dumps of the crates.io database -- confirm.
"""

import csv
import itertools
import sys
from collections import defaultdict
from operator import itemgetter

import networkx as nx
import matplotlib.pyplot as plt
from networkx.drawing.nx_agraph import graphviz_layout

# Row whose first field equals this marker terminates the data section.
END_OF_DATA = '\\.'

CRATES_FILE = '2510.dat'
VERSIONS_FILE = '2514.dat'
DEPENDENCIES_FILE = '2511.dat'

CRATE_FIELDS = [
    'id',
    'name',
    'updated_at',
    'created_at',
    'downloads',
    'max_version',
    'description',
    'homepage',
    'documentation',
    'readme',
    'textsearchable_index_col',
    'license',
    'repository',
    'max_upload_size',
]

VERSION_FIELDS = [
    'id',
    'crate_id',
    'num',
    'updated_at',
    'created_at',
    'downloads',
    'features',
    'yanked',
]

DEPENDENCY_FIELDS = [
    'id',
    'version_id',
    'crate_id',
    'req',
    'optional',
    'default_features',
    'features',
    'target',
    'kind',
]


def _is_data_row(row):
    """Return False for the row that marks the end of the data section."""
    return row[:1] != [END_OF_DATA]


def parse(filename, fieldnames):
    """Yield one dict per data row of a tab-separated dump file.

    Args:
        filename: path of the file to read.
        fieldnames: column names, in file order.  Each row is zipped
            against them, so surplus columns or names are dropped.

    Yields:
        dict mapping field name to the raw string value of that column.

    Reading stops at the first row whose first field is ``\\.``.  Blank
    lines are skipped instead of raising IndexError.
    """
    with open(filename) as f:
        # The dump format does not quote fields, so a value that merely
        # starts with '"' must not be treated as a quoted field.
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in itertools.takewhile(_is_data_row, reader):
            if row:
                yield dict(zip(fieldnames, row))


def _print_stats(G):
    """Print the current node and edge counts of G."""
    print('nodes: %d' % G.number_of_nodes())
    print('edges: %d' % G.number_of_edges())


def _write_ranking(filename, scores):
    """Write (node, score) rows to a CSV file, highest score first."""
    ranked = sorted(scores.items(), key=itemgetter(1), reverse=True)
    with open(filename, 'w') as f:
        csv.writer(f).writerows(ranked)


def purge_disconnected(G):
    """Remove, in place, every node of G whose degree is 2 or less.

    Degrees are snapshotted before any node is removed, so the decision
    for each node is based on the graph as it was when the call started.
    """
    print('purging disconnected crates...')

    # dict(...) copies the degrees; G.degree() may be a live view that
    # must not be iterated while nodes are being removed.
    for crate, rank in dict(G.degree()).items():
        if rank <= 2:
            G.remove_node(crate)

    _print_stats(G)


def purge_low_pagerank(G, keep=1000):
    """Remove, in place, all but the `keep` highest-pagerank nodes of G.

    Args:
        G: graph to prune.
        keep: number of top-ranked nodes to retain (default 1000).
    """
    print('keep the top %d pagerank crates' % keep)

    pr = nx.pagerank(G, alpha=0.9)
    ranked = sorted(pr.items(), key=itemgetter(1), reverse=True)
    for crate, _ in ranked[keep:]:
        G.remove_node(crate)

    _print_stats(G)


def load_graph():
    """Build the undirected crate dependency graph from the dump files.

    Returns:
        networkx.Graph with one node per crate name and an edge between a
        crate and each crate that any of its versions depends on.

    Raises:
        ValueError: if an id column is not an integer.
        KeyError: if a dependency row references an unknown crate or
            version id.
    """
    G = nx.Graph()

    print('loading crates...')
    crate_name_by_id = {}
    for crate in parse(filename=CRATES_FILE, fieldnames=CRATE_FIELDS):
        crate_name_by_id[int(crate['id'])] = crate['name']
        G.add_node(crate['name'])

    print('loading versions...')
    crate_id_by_version_id = {}
    for version in parse(filename=VERSIONS_FILE, fieldnames=VERSION_FIELDS):
        crate_id_by_version_id[int(version['id'])] = int(version['crate_id'])

    print('loading dependencies...')
    dependencies = parse(
        filename=DEPENDENCIES_FILE, fieldnames=DEPENDENCY_FIELDS)
    for dependency in dependencies:
        # crate_id is the crate being depended on; version_id identifies
        # the version (and through it the crate) that declares the
        # dependency.
        depended_on = crate_name_by_id[int(dependency['crate_id'])]
        dependent = crate_name_by_id[
            crate_id_by_version_id[int(dependency['version_id'])]
        ]
        G.add_edge(dependent, depended_on)

    return G


def plot_graph(G, betweenness):
    """Draw G coloured by normalised betweenness; save and show the figure.

    Args:
        G: graph to draw.
        betweenness: dict mapping every node of G to its betweenness score.

    Writes ``deps.png`` and opens an interactive window.  Does nothing
    when `betweenness` is empty.
    """
    if not betweenness:
        return

    # Fall back to 1.0 so an all-zero ranking does not divide by zero.
    max_rank = max(betweenness.values()) or 1.0
    colors = [betweenness[crate] / max_rank for crate in G]

    pos = graphviz_layout(G, prog='twopi', args='')

    plt.figure(figsize=(16, 8))
    nx.draw(
        G,
        pos,
        node_size=100,
        alpha=0.9,
        node_color=colors,
        cmap=plt.cm.inferno,
        edge_color='#A0CBE2',
        with_labels=True)
    plt.axis('equal')
    plt.savefig('deps.png')
    plt.show()


def main(plot=False):
    """Load the graph and write the four centrality rankings.

    Args:
        plot: when true, also render the pruned graph with plot_graph().
            Defaults to False, which matches the previous behaviour where
            the plotting code was never reached.

    Writes degree.csv, pagerank.csv, betweenness.csv and closeness.csv.
    """
    G = load_graph()
    _print_stats(G)

    print('calculating degree centrality...')
    _write_ranking('degree.csv', nx.degree_centrality(G))

    print('calculating pagerank...')
    _write_ranking('pagerank.csv', nx.pagerank(G, alpha=0.9))

    # Betweenness and closeness are really expensive, so purge out a lot of
    # nodes
    purge_disconnected(G)
    purge_low_pagerank(G)
    purge_disconnected(G)

    print('calculating betweenness...')
    betweenness = nx.betweenness_centrality(G)
    _write_ranking('betweenness.csv', betweenness)

    print('calculating closeness...')
    _write_ranking('closeness.csv', nx.closeness_centrality(G))

    if plot:
        plot_graph(G, betweenness)


if __name__ == '__main__':
    sys.exit(main())