In [52]:
import os

import numpy as np
import pandas as pd
import requests
from implicit.als import AlternatingLeastSquares
from scipy.sparse import coo_matrix

In [118]:
# Credentials are read from the environment so that real values never have to
# be typed into (and saved with) the notebook. The "CHANGEME" fallbacks keep
# the original placeholder behaviour when a variable is not set.
project_id = os.environ.get("GOOGLE_CLOUD_PROJECT", "CHANGEME")  # passed to read_gbq as project_id
github_user = os.environ.get("GITHUB_USER", "CHANGEME")
github_token = os.environ.get("GITHUB_TOKEN", "CHANGEME") # from https://github.com/settings/tokens

In [62]:
# HTTP Basic credentials (user + personal access token) that user_stars()
# attaches to each GitHub API request.
github_auth = requests.auth.HTTPBasicAuth(github_user, github_token)

In [135]:
# Pull GitHub star ("WatchEvent") activity from the githubarchive.month.201706
# table on BigQuery. The CTEs narrow the data down step by step:
#   * stars              - every WatchEvent as (user, repo, timestamp)
#   * repositories_stars - the 1000 repos with the most such events
#   * users_stars        - at most 10000 users with more than 10 and fewer
#                          than 100 stars among those repos
# The final SELECT keeps only events whose repo and user both pass those
# filters, newest first.
# NOTE(review): users_stars applies LIMIT without an ORDER BY, so which users
# are kept is presumably not guaranteed to be stable across runs - confirm.
query = """
WITH stars AS (
     SELECT actor.login AS user, repo.name AS repo, created_at AS timestamp
     FROM githubarchive.month.201706
     WHERE type="WatchEvent"
),
repositories_stars AS (
     SELECT repo, COUNT(*) as c
     FROM stars
     GROUP BY repo
     ORDER BY c DESC
     LIMIT 1000
),
users_stars AS (
    SELECT user, COUNT(*) as c
    FROM  stars
    WHERE repo IN (SELECT repo FROM repositories_stars)
    GROUP BY user
    HAVING c > 10 AND C < 100
    LIMIT 10000
)
SELECT
user, repo, timestamp
FROM stars
WHERE repo IN (SELECT repo FROM repositories_stars)
AND user IN (SELECT user FROM users_stars)
ORDER BY timestamp DESC
"""

# Run the query with the standard SQL dialect; `timestamp` becomes the
# DataFrame index, leaving `user` and `repo` as the columns.
data = pd.io.gbq.read_gbq(query, index_col="timestamp", dialect="standard", project_id=project_id)

Requesting query... ok.
Query running...
Query done.
Cache hit.

Retrieving results...
Got 78238 rows.

Total time taken 6.51 s.
Finished at 2017-06-24 09:08:00.


In [136]:
# Peek at the first rows to sanity-check the columns returned by the query.
data.head()

Unnamed: 0_level_0,user,repo
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-06-23 23:57:04,n3tn0de,webkul/coolhue
2017-06-23 23:55:08,psw0714,justjavac/free-programming-books-zh_CN
2017-06-23 23:54:36,psw0714,ecomfe/echarts
2017-06-23 23:54:21,psw0714,tastejs/todomvc
2017-06-23 23:54:14,psw0714,babel/babel


In [140]:
# map each repo and user to a unique numeric value
data['user'] = data['user'].astype("category")
data['repo'] = data['repo'].astype("category")

# create a sparse matrix of all the users/repos
stars = coo_matrix((np.ones(data.shape[0]),
                   (data['repo'].cat.codes.copy(),
                    data['user'].cat.codes.copy())))

In [141]:
# Show the matrix summary (shape, dtype and number of stored elements).
stars

<999x4348 sparse matrix of type '<type 'numpy.float64'>'
	with 78238 stored elements in COOrdinate format>

In [142]:
# ALS hyper-parameters gathered in one place so they are easy to tweak.
# NOTE(review): no random seed is set in the visible cells, so the factor
# initialisation - and therefore the exact scores shown further down - may
# differ between runs; confirm whether reproducibility matters here.
als_params = dict(
    factors=50,
    regularization=0.01,
    dtype=np.float64,
    iterations=50,
)
model = AlternatingLeastSquares(**als_params)

In [144]:
# Every observed star is scaled by the same confidence weight before training.
# `stars` is laid out repo-by-user (rows = repos, columns = users).
# NOTE(review): confirm that this orientation is what fit() expects in the
# installed version of implicit.
confidence = 40
weighted_stars = confidence * stars
model.fit(weighted_stars)

In [126]:
# Two-way lookup between the integer category codes used as matrix row
# indices and the repository names:
#   repos:    code -> "owner/name"
#   repo_ids: "owner/name" -> code
repos = dict(enumerate(data['repo'].cat.categories))
# .items() works on both Python 2 and 3 (dict.iteritems() is Python 2 only).
repo_ids = {name: code for code, name in repos.items()}

In [127]:
# Repos most similar to tensorflow/tensorflow, shown as (repo name, score).
similar = model.similar_items(repo_ids['tensorflow/tensorflow'])
[(repos[item_id], score) for item_id, score in similar]

[(u'tensorflow/tensorflow', 1.0000000000000004),
 (u'jikexueyuanwiki/tensorflow-zh', 0.52015405760492706),
 (u'BVLC/caffe', 0.4161581732982037),
 (u'scikit-learn/scikit-learn', 0.40543551306117309),
 (u'google/protobuf', 0.40160716582156247),
 (u'fchollet/keras', 0.39897590674119598),
 (u'shadowsocksr/shadowsocksr-csharp', 0.3798671235574328),
 (u'ethereum/mist', 0.37205191726130321),
 (u'pandas-dev/pandas', 0.34311692603549021),
 (u'karpathy/char-rnn', 0.33868380215281335)]

In [129]:
def user_stars(user):
    """Return the full names of all repositories starred by a GitHub user.

    Requests the user's ``/starred`` listing and keeps following the
    ``next`` link advertised in the response's ``Link`` header until there
    is none left.

    Args:
        user: GitHub login whose stars are fetched.

    Returns:
        A list of ``"owner/name"`` strings, in the order the API returned them.

    Raises:
        requests.HTTPError: if any page comes back with an error status
            (for example bad credentials, unknown user or rate limiting).
    """
    starred = []
    url = "https://api.github.com/users/{}/starred".format(user)
    while url:
        resp = requests.get(url, auth=github_auth)
        # Error responses carry a JSON object rather than a list; fail here
        # with a clear HTTPError instead of a confusing TypeError below.
        resp.raise_for_status()
        starred.extend(r["full_name"] for r in resp.json())
        # `links` is parsed from the Link header; no "next" entry means we
        # have reached the last page.
        url = resp.links.get("next", {}).get("url")
    return starred

def user_items(u_stars):
    """Turn a list of starred repo names into a single-row CSR matrix.

    Names that are not keys of ``repo_ids`` are ignored. Each remaining repo
    gets the value ``confidence`` in row 0 at the column given by its id.

    Args:
        u_stars: iterable of ``"owner/name"`` repository names.

    Returns:
        A ``1 x n`` CSR matrix, where ``n`` is ``model.item_factors.shape[0]``.
    """
    known_ids = [repo_ids[name] for name in u_stars if name in repo_ids]
    n_known = len(known_ids)
    n_items = model.item_factors.shape[0]
    single_row = coo_matrix(
        ([confidence] * n_known, ([0] * n_known, known_ids)),
        shape=(1, n_items),
    )
    return single_row.tocsr()

In [130]:
# Fetch the stars of GitHub user "jbochi" and encode them as a model input row.
jbochi_starred = user_stars("jbochi")
jbochi = user_items(jbochi_starred)

In [131]:
def recommend(user_items):
    """Return recommendations for row 0 of ``user_items`` as (repo name, score).

    Args:
        user_items: sparse user-by-item matrix; only row 0 is queried.

    Returns:
        A list of ``(repo name, score)`` tuples in the order produced by
        ``model.recommend``.
    """
    # NOTE(review): recalculate_user=True presumably makes the model derive
    # the user's factors from the supplied row rather than from a user seen
    # during training - confirm against the implicit documentation.
    scored = model.recommend(userid=0, user_items=user_items, recalculate_user=True)
    named = []
    for item_id, score in scored:
        named.append((repos[item_id], score))
    return named

def explain(user_items, repo):
    """Return the per-repo breakdown ``model.explain`` gives for one repo.

    Args:
        user_items: sparse user-by-item matrix; only row 0 is queried.
        repo: ``"owner/name"`` of the repository to explain; must be a key
            of ``repo_ids``.

    Returns:
        A list of ``(repo name, value)`` tuples built from the second element
        of the triple returned by ``model.explain``.
    """
    explanation = model.explain(userid=0, user_items=user_items, itemid=repo_ids[repo])
    # Only the middle element of the returned triple is used here.
    _, contributions, _ = explanation
    named = []
    for item_id, value in contributions:
        named.append((repos[item_id], value))
    return named

In [132]:
# Recommendations for the `jbochi` row, displayed as (repo name, score) pairs.
recommend(jbochi)

[(u'ansible/ansible', 1.3480146093553365),
 (u'airbnb/superset', 1.337698670756992),
 (u'scrapy/scrapy', 1.2682612609169515),
 (u'grpc/grpc', 1.1558718295721062),
 (u'scikit-learn/scikit-learn', 1.1539551159232055),
 (u'grafana/grafana', 1.1265144087278358),
 (u'google/protobuf', 1.078458167396922),
 (u'lodash/lodash', 1.0690341693223879),
 (u'josephmisiti/awesome-machine-learning', 1.0553796439629786),
 (u'd3/d3', 1.0546232373207065)]

In [133]:
# Breakdown returned by model.explain for 'fchollet/keras' given the `jbochi`
# row - presumably the starred repos that contribute most to that score;
# confirm against the implicit documentation.
explain(jbochi, 'fchollet/keras')

[(u'pandas-dev/pandas', 0.18368079727509334),
 (u'BVLC/caffe', 0.15726607611115795),
 (u'requests/requests', 0.15263841163355341),
 (u'pallets/flask', 0.15259412774463132),
 (u'robbyrussell/oh-my-zsh', 0.1503775470984523),
 (u'apache/spark', 0.12771260655405856),
 (u'tensorflow/tensorflow', 0.12343847633950071),
 (u'kripken/emscripten', 0.12294875917036562),
 (u'videojs/video.js', 0.12279727716802587),
 (u'rust-lang/rust', 0.10859551238691327)]