Skip to content

Instantly share code, notes, and snippets.

@erickt
Created February 9, 2017 16:57
Show Gist options
  • Select an option

  • Save erickt/01e2c6cdb9c936bc9ffe371acd0cb38e to your computer and use it in GitHub Desktop.

Select an option

Save erickt/01e2c6cdb9c936bc9ffe371acd0cb38e to your computer and use it in GitHub Desktop.

Revisions

  1. erickt created this gist Feb 9, 2017.
    206 changes: 206 additions & 0 deletions analyze.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,206 @@
    import csv
    import itertools
    import sys
    from collections import defaultdict
    from operator import itemgetter

    import networkx as nx
    import matplotlib.pyplot as plt
    from networkx.drawing.nx_agraph import graphviz_layout


    def parse(filename, fieldnames):
    with open(filename) as f:
    reader = csv.reader(f, delimiter='\t')
    for row in reader:
    if row[0] == '\\.':
    break

    d = {}
    for fieldname, value in itertools.izip(fieldnames, row):
    d[fieldname] = value

    yield d


def purge_disconnected(G):
    # Remove weakly-connected crates from the graph in place.
    # NOTE(review): despite the name, this drops every node with degree <= 2,
    # not only fully disconnected (degree 0) nodes -- confirm that is intended.
    print 'purging disconnected crates...'

    # Assumes pre-2.0 networkx, where G.degree() returns a plain dict
    # (hence the Python 2 .iteritems()); since that dict is a snapshot,
    # removing nodes inside the loop is safe. Neighbors whose degree drops
    # below the threshold as a result are NOT re-checked in this pass.
    for crate, rank in G.degree().iteritems():
        if rank <= 2:
            G.remove_node(crate)

    print 'nodes:', G.number_of_nodes()
    print 'edges:', G.number_of_edges()


def purge_low_pagerank(G):
    # Keep only the 1000 highest-PageRank crates; remove everything else,
    # mutating G in place.
    print 'keep the top 1000 pagerank crates'

    pr = nx.pagerank(G, alpha=0.9)
    # Sort (crate, rank) pairs by rank, highest first; everything after the
    # first 1000 entries is purged.
    pr = sorted(pr.items(), key=itemgetter(1), reverse=True)
    for crate, rank in pr[1000:]:
        G.remove_node(crate)

    print 'nodes:', G.number_of_nodes()
    print 'edges:', G.number_of_edges()


def main():
    # Build an undirected crate-dependency graph from crates.io pg_dump data
    # files, then write degree / pagerank / betweenness / closeness rankings
    # to CSV files in the current directory.
    G = nx.Graph()

    print 'loading crates...'
    # 2510.dat: rows of the `crates` table.
    crates = parse(
        filename='2510.dat',
        fieldnames=[
            'id',
            'name',
            'updated_at',
            'created_at',
            'downloads',
            'max_version',
            'description',
            'homepage',
            'documentation',
            'readme',
            'textsearchable_index_col',
            'license',
            'repository',
            'max_upload_size',
        ])

    # Index crates by id and by name; one graph node per crate name.
    crate_id_to_crate = {}
    crate_name_to_crate = {}
    for crate in crates:
        crate['id'] = int(crate['id'])

        crate_id_to_crate[crate['id']] = crate
        crate_name_to_crate[crate['name']] = crate

        G.add_node(crate['name'])

    print 'loading versions...'
    # 2514.dat: rows of the `versions` table.
    versions = parse(
        filename='2514.dat',
        fieldnames=[
            'id',
            'crate_id',
            'num',
            'updated_at',
            'created_at',
            'downloads',
            'features',
            'yanked',
        ])

    version_id_to_version = {}
    crate_id_to_version = defaultdict(list)
    for version in versions:
        version['id'] = int(version['id'])
        version['crate_id'] = int(version['crate_id'])

        version_id_to_version[version['id']] = version
        crate_id_to_version[version['crate_id']].append(version)

    print 'loading dependencies...'
    # 2511.dat: rows of the `dependencies` table.
    dependencies = parse(
        filename='2511.dat',
        fieldnames=[
            'id',
            'version_id',
            'crate_id',
            'req',
            'optional',
            'default_features',
            'features',
            'target',
            'kind',
        ])

    dependency_id_to_dependency = {}
    version_id_to_dependency = defaultdict(list)
    crate_id_to_dependency = defaultdict(list)
    for dependency in dependencies:
        dependency['id'] = int(dependency['id'])
        dependency['crate_id'] = int(dependency['crate_id'])
        dependency['version_id'] = int(dependency['version_id'])

        dependency_id_to_dependency[dependency['id']] = dependency
        version_id_to_dependency[dependency['version_id']].append(dependency)
        crate_id_to_dependency[dependency['crate_id']].append(dependency)

        # Presumably dependency.crate_id names the depended-upon crate and
        # dependency.version_id names the depending crate's version (crates.io
        # schema) -- verify against the dump. G is undirected, so the edge
        # argument order below is cosmetic.
        # NOTE(review): version_id_to_version lookup raises KeyError if a
        # dependency references a version id absent from 2514.dat.
        src_id = crate_id_to_crate[dependency['crate_id']]['name']
        dst_id = crate_id_to_crate[
            version_id_to_version[dependency['version_id']]['crate_id']
        ]['name']

        G.add_edge(dst_id, src_id)

    print 'nodes:', G.number_of_nodes()
    print 'edges:', G.number_of_edges()

    ####

    print 'calculating degree centrality...'
    degree = nx.degree_centrality(G)

    # Each ranking CSV is (name, score) rows, highest score first.
    with open('degree.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerows(
            sorted(degree.items(), key=itemgetter(1), reverse=True))

    print 'calculating pagerank...'
    pr = nx.pagerank(G, alpha=0.9)

    with open('pagerank.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerows(
            sorted(pr.items(), key=itemgetter(1), reverse=True))

    ####
    # Betweenness and closeness are really expensive, so purge out a lot of
    # nodes

    purge_disconnected(G)
    purge_low_pagerank(G)
    purge_disconnected(G)

    print 'calculating betweenness...'
    betweenness = nx.betweenness_centrality(G)

    with open('betweenness.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerows(
            sorted(betweenness.items(), key=itemgetter(1), reverse=True))

    print 'calculating closeness...'
    closeness = nx.closeness_centrality(G)

    with open('closeness.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerows(
            sorted(closeness.items(), key=itemgetter(1), reverse=True))

    return

    # NOTE(review): everything below is unreachable because of the bare
    # `return` above -- the plotting step appears to have been disabled
    # deliberately. Delete it or drop the `return` to re-enable.
    max_rank = max(betweenness.itervalues())
    colors = [betweenness[crate] / max_rank for crate in G.nodes_iter()]

    pos = graphviz_layout(G, prog='twopi', args='')
    plt.figure(figsize=(16, 8))
    # NOTE(review): nx.draw's colormap keyword is `cmap`, not `node_cmap`;
    # double-check this kwarg if the plot is ever re-enabled.
    nx.draw(
        G,
        pos,
        node_size=100,
        alpha=0.9,
        node_color=colors,
        node_cmap=plt.cm.inferno,
        edge_color='#A0CBE2',
        with_labels=True)
    plt.axis('equal')
    plt.savefig('deps.png')
    plt.show()


if __name__ == '__main__':
    # Propagate main()'s return value as the process exit status
    # (main() returns None, which exits 0).
    sys.exit(main())