{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy.sparse import coo_matrix\n",
    "from implicit.als import AlternatingLeastSquares\n",
    "import requests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Credentials are read from the environment so that real secrets never have\n",
    "# to be saved inside the notebook; \"CHANGEME\" is kept as the fallback value.\n",
    "project_id = os.environ.get(\"GOOGLE_CLOUD_PROJECT\", \"CHANGEME\")\n",
    "github_user = os.environ.get(\"GITHUB_USER\", \"CHANGEME\")\n",
    "github_token = os.environ.get(\"GITHUB_TOKEN\", \"CHANGEME\")  # from https://github.com/settings/tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# HTTP basic-auth credentials used for the GitHub API requests\n",
    "github_auth = requests.auth.HTTPBasicAuth(github_user, github_token)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requesting query... ok.\n",
      "Query running...\n",
      "Query done.\n",
      "Cache hit.\n",
      "\n",
      "Retrieving results...\n",
      "Got 78238 rows.\n",
      "\n",
      "Total time taken 6.51 s.\n",
      "Finished at 2017-06-24 09:08:00.\n"
     ]
    }
   ],
   "source": [
    "# BigQuery (standard SQL) over the June 2017 GitHub Archive table:\n",
    "#   stars              - one row per WatchEvent (user, repo, timestamp)\n",
    "#   repositories_stars - the 1000 repos with the most such events\n",
    "#   users_stars        - up to 10000 users with 11 to 99 events on those repos\n",
    "# NOTE: users_stars applies LIMIT without ORDER BY, so which users are kept\n",
    "# is not guaranteed to be the same on every run.\n",
    "query = \"\"\"\n",
    "WITH stars AS (\n",
    " SELECT actor.login AS user, repo.name AS repo, created_at AS timestamp\n",
    " FROM githubarchive.month.201706\n",
    " WHERE type=\"WatchEvent\"\n",
    "),\n",
    "repositories_stars AS (\n",
    " SELECT repo, COUNT(*) as c\n",
    " FROM stars\n",
    " GROUP BY repo\n",
    " ORDER BY c DESC\n",
    " LIMIT 1000\n",
    "),\n",
    "users_stars AS (\n",
    " SELECT user, COUNT(*) as c\n",
    " FROM stars\n",
    " WHERE repo IN (SELECT repo FROM repositories_stars)\n",
    " GROUP BY user\n",
    " HAVING c > 10 AND C < 100\n",
    " LIMIT 10000\n",
    ")\n",
    "SELECT\n",
    "user, repo, timestamp\n",
    "FROM stars\n",
    "WHERE repo IN (SELECT repo FROM repositories_stars)\n",
    "AND user IN (SELECT user FROM users_stars)\n",
    "ORDER BY timestamp DESC\n",
    "\"\"\"\n",
    "\n",
    "# one row per star event, indexed by the event timestamp\n",
    "data = pd.io.gbq.read_gbq(query, index_col=\"timestamp\", dialect=\"standard\", project_id=project_id)"
   ]
  },
  {
"cell_type": "code", "execution_count": 136, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userrepo
timestamp
2017-06-23 23:57:04n3tn0dewebkul/coolhue
2017-06-23 23:55:08psw0714justjavac/free-programming-books-zh_CN
2017-06-23 23:54:36psw0714ecomfe/echarts
2017-06-23 23:54:21psw0714tastejs/todomvc
2017-06-23 23:54:14psw0714babel/babel
\n", "
" ],
      "text/plain": [
       " user repo\n",
       "timestamp \n",
       "2017-06-23 23:57:04 n3tn0de webkul/coolhue\n",
       "2017-06-23 23:55:08 psw0714 justjavac/free-programming-books-zh_CN\n",
       "2017-06-23 23:54:36 psw0714 ecomfe/echarts\n",
       "2017-06-23 23:54:21 psw0714 tastejs/todomvc\n",
       "2017-06-23 23:54:14 psw0714 babel/babel"
      ]
     },
     "execution_count": 136,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# map each repo and user to a unique numeric value\n",
    "data['user'] = data['user'].astype(\"category\")\n",
    "data['repo'] = data['repo'].astype(\"category\")\n",
    "\n",
    "# sparse star matrix: one row per repo code, one column per user code,\n",
    "# with a 1.0 entry for every star event\n",
    "stars = coo_matrix((np.ones(data.shape[0]),\n",
    "                    (data['repo'].cat.codes.copy(),\n",
    "                     data['user'].cat.codes.copy())))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<999x4348 sparse matrix of type ''\n",
       "\twith 78238 stored elements in COOrdinate format>"
      ]
     },
     "execution_count": 141,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stars"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "model = AlternatingLeastSquares(factors=50,\n",
    "                                regularization=0.01,\n",
    "                                dtype=np.float64,\n",
    "                                iterations=50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# constant weight applied to every observed star before fitting\n",
    "confidence = 40\n",
    "model.fit(confidence * stars)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# category code -> repo name, plus the reverse lookup (repo name -> code)\n",
    "repos = dict(enumerate(data['repo'].cat.categories))\n",
    "# .items() rather than .iteritems() so the cell also runs on Python 3\n",
    "repo_ids = {r: i for i, r in repos.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(u'tensorflow/tensorflow', 
1.0000000000000004),\n",
       " (u'jikexueyuanwiki/tensorflow-zh', 0.52015405760492706),\n",
       " (u'BVLC/caffe', 0.4161581732982037),\n",
       " (u'scikit-learn/scikit-learn', 0.40543551306117309),\n",
       " (u'google/protobuf', 0.40160716582156247),\n",
       " (u'fchollet/keras', 0.39897590674119598),\n",
       " (u'shadowsocksr/shadowsocksr-csharp', 0.3798671235574328),\n",
       " (u'ethereum/mist', 0.37205191726130321),\n",
       " (u'pandas-dev/pandas', 0.34311692603549021),\n",
       " (u'karpathy/char-rnn', 0.33868380215281335)]"
      ]
     },
     "execution_count": 127,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[(repos[r], s) for r, s in model.similar_items(repo_ids['tensorflow/tensorflow'])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def user_stars(user):\n",
    "    \"\"\"Return the `full_name` of every repository starred by `user`.\n",
    "\n",
    "    Follows the GitHub API's `next` pagination link until there is none.\n",
    "    Raises requests.HTTPError when a request fails, instead of tripping\n",
    "    over the error payload while reading `full_name`.\n",
    "    \"\"\"\n",
    "    starred = []\n",
    "    # per_page=100 only reduces the number of round trips; results are the same\n",
    "    url = \"https://api.github.com/users/{}/starred?per_page=100\".format(user)\n",
    "    while url:\n",
    "        resp = requests.get(url, auth=github_auth)\n",
    "        resp.raise_for_status()\n",
    "        starred += [r[\"full_name\"] for r in resp.json()]\n",
    "        # no \"next\" link on the last page -> url becomes None and the loop ends\n",
    "        url = resp.links.get(\"next\", {}).get(\"url\")\n",
    "    return starred\n",
    "\n",
    "\n",
    "def user_items(u_stars):\n",
    "    \"\"\"Build a single-row CSR matrix of confidence weights from repo names.\n",
    "\n",
    "    Names missing from `repo_ids` are ignored.  Ids are de-duplicated because\n",
    "    the COO -> CSR conversion sums repeated entries, which would otherwise\n",
    "    inflate the weight of a repo listed twice.\n",
    "    \"\"\"\n",
    "    star_ids = sorted({repo_ids[s] for s in u_stars if s in repo_ids})\n",
    "    weights = [confidence] * len(star_ids)\n",
    "    rows = [0] * len(star_ids)\n",
    "    # one row, one column per item factor row of the fitted model\n",
    "    shape = (1, model.item_factors.shape[0])\n",
    "    return coo_matrix((weights, (rows, star_ids)), shape=shape).tocsr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "jbochi = user_items(user_stars(\"jbochi\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def recommend(user_items):\n",
    "    \"\"\"Return (repo name, score) pairs from model.recommend for one user row.\n",
    "\n",
    "    `user_items` is a single-row matrix, so the user is always addressed as\n",
    "    userid 0; recalculate_user=True is passed along with that row.\n",
    "    \"\"\"\n",
    "    recs = model.recommend(userid=0, user_items=user_items, recalculate_user=True)\n",
    "    return [(repos[r], s) for r, s in recs]\n",
    "\n",
    "\n",
    "def explain(user_items, repo):\n",
    "    \"\"\"Return (repo name, score) pairs from model.explain for `repo`.\n",
    "\n",
    "    Only the middle element of the 3-tuple returned by model.explain is used.\n",
    "    Raises KeyError if `repo` is not in `repo_ids`.\n",
    "    \"\"\"\n",
    "    _, recs, _ = model.explain(userid=0, user_items=user_items, itemid=repo_ids[repo])\n",
    "    return [(repos[r], s) for r, s in recs]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(u'ansible/ansible', 1.3480146093553365),\n",
       " (u'airbnb/superset', 1.337698670756992),\n",
       " (u'scrapy/scrapy', 1.2682612609169515),\n",
       " (u'grpc/grpc', 1.1558718295721062),\n",
       " (u'scikit-learn/scikit-learn', 1.1539551159232055),\n",
       " (u'grafana/grafana', 1.1265144087278358),\n",
       " (u'google/protobuf', 1.078458167396922),\n",
       " (u'lodash/lodash', 1.0690341693223879),\n",
       " (u'josephmisiti/awesome-machine-learning', 1.0553796439629786),\n",
       " (u'd3/d3', 1.0546232373207065)]"
      ]
     },
     "execution_count": 132,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "recommend(jbochi)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(u'pandas-dev/pandas', 0.18368079727509334),\n",
       " (u'BVLC/caffe', 0.15726607611115795),\n",
       " (u'requests/requests', 0.15263841163355341),\n",
       " (u'pallets/flask', 0.15259412774463132),\n",
       " (u'robbyrussell/oh-my-zsh', 0.1503775470984523),\n",
       " (u'apache/spark', 0.12771260655405856),\n",
       " (u'tensorflow/tensorflow', 0.12343847633950071),\n",
       " (u'kripken/emscripten', 0.12294875917036562),\n",
       " (u'videojs/video.js', 0.12279727716802587),\n",
       " (u'rust-lang/rust', 0.10859551238691327)]"
      ]
     },
     "execution_count": 133,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "explain(jbochi, 'fchollet/keras')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}