Last active
August 29, 2015 14:19
-
-
Save Newmu/fd18ffd002af49b9b208 to your computer and use it in GitHub Desktop.
Revisions
-
Newmu revised this gist
Apr 14, 2015. 1 changed file with 31 additions and 18 deletions. There are no files selected for viewing.
# Gist revision of Apr 14, 2015 (diff hunk @@ -1,46 +1,59 @@), restored from
# the collapsed diff view into conventionally indented Python that runs on
# both Python 2 and Python 3.
import json
import random
from time import time

import numpy as np


def len_filter(text, max_len=1014):
    """Truncate *text* at a word boundary so it fits in *max_len* characters.

    The text is split on single spaces and a running character count
    (words plus the spaces that join them) is computed per word.  A word is
    kept while its running count is below *max_len*.  A text whose total
    length is at most *max_len* is returned unchanged.

    Args:
        text: The string to shorten.
        max_len: Character budget; defaults to 1014.

    Returns:
        The kept leading words re-joined with single spaces (possibly '').
    """
    words = text.split(' ')
    # Every word costs its own length plus one joining space ...
    lens = [len(word) + 1 for word in words]
    # ... except the first word, which has no space in front of it.
    lens[0] -= 1
    # The final word is deliberately counted one character short (as in the
    # original); this is what lets a text of exactly max_len characters pass.
    lens[-1] -= 1
    ends = np.cumsum(lens).tolist()
    return ' '.join(w for w, end in zip(words, ends) if end < max_len)


def load(review_json_path, ntrain=10000, ntest=10000):
    """Build a class-balanced, shuffled train/test split from a review file.

    The file is read one JSON object per line; each object must provide
    'stars' and 'text'.  Reviews with exactly 3 stars or with fewer than 100
    characters of text are skipped.  The remaining texts are shortened with
    ``len_filter`` and pooled as positive (stars > 3) or negative.  Reading
    stops as soon as both pools are large enough, then an equal number of
    positives and negatives is sampled, shuffled and split.

    Side effects: seeds the global ``random`` generator with 42 and prints a
    progress line (line index, balanced example count) every 10000 lines.

    Args:
        review_json_path: Path of the JSON-lines review file.
        ntrain: Number of training examples wanted.
        ntest: Number of test examples wanted (taken from the end of the
            shuffled data).

    Returns:
        ``(trX, teX, trY, teY)``: lists of texts and of float labels
        (1.0 for positive, 0.0 for negative).

    Raises:
        ValueError: If the file does not hold enough usable reviews of
            either class.
    """
    random.seed(42)  # fixed seed keeps sampling and shuffling reproducible
    total = ntrain + ntest
    # Floor division: "/" yields a float on Python 3, which random.sample and
    # list repetition both reject.
    half = total // 2
    pos_text = []
    neg_text = []
    # The context manager closes the file even when a line fails to parse.
    with open(review_json_path) as f:
        for i, row in enumerate(f):
            if min(len(pos_text), len(neg_text)) * 2 >= total:
                break
            data = json.loads(row)
            if data['stars'] != 3 and len(data['text']) >= 100:
                text = len_filter(data['text'])
                if data['stars'] > 3:
                    pos_text.append(text)
                else:
                    neg_text.append(text)
            if i % 10000 == 0:
                print('%s %s' % (i, min(len(pos_text), len(neg_text)) * 2))
    if min(len(pos_text), len(neg_text)) < half:
        raise ValueError(
            'need %d reviews per class but found only %d positive and %d '
            'negative' % (half, len(pos_text), len(neg_text)))
    text = random.sample(pos_text, half) + random.sample(neg_text, half)
    labels = [1.] * half + [0.] * half
    idxs = list(range(len(text)))
    random.shuffle(idxs)
    text = [text[idx] for idx in idxs]
    labels = [labels[idx] for idx in idxs]
    # Split by an explicit index: text[:-ntest] would be empty (and
    # text[-ntest:] the whole list) when ntest == 0.
    split = max(len(text) - ntest, 0)
    return text[:split], text[split:], labels[:split], labels[split:]


if __name__ == "__main__":
    # Third-party modelling/plotting imports are only needed when running as
    # a script; importing them here keeps len_filter() and load() importable
    # without matplotlib or scikit-learn installed.
    from matplotlib import pyplot as plt
    from sklearn import metrics
    from sklearn.linear_model import LogisticRegression as LR
    from sklearn.feature_extraction.text import TfidfVectorizer

    review_json_path = '/home/alec/datasets/yelp/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json'
    trX, teX, trY, teY = load(review_json_path, ntrain=500000, ntest=50000)
    t = time()
    vect = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.9)
    # NOTE(review): the diff excerpt for this revision ends here; the rest of
    # the script is not visible in this view.
Newmu created this gist
Apr 13, 2015. There are no files selected for viewing.
# Original gist revision of Apr 13, 2015 (diff hunk @@ -0,0 +1,61 @@),
# restored from the collapsed diff view into conventionally indented Python
# that runs on both Python 2 and Python 3.
import json
from time import time

import numpy as np


def load(review_json_path, ntrain=10000, ntest=10000):
    """Load a shuffled train/test split of reviews from a JSON-lines file.

    The file is read one JSON object per line; each object must provide
    'stars' and 'text'.  Reviews with exactly 3 stars are skipped.  Reading
    stops once ``ntrain + ntest`` reviews have been collected, so only the
    first matching reviews in file order are used.  If the file holds fewer,
    all of them are returned and the training part is correspondingly
    smaller.

    Args:
        review_json_path: Path of the JSON-lines review file.
        ntrain: Number of training examples wanted.
        ntest: Number of test examples wanted (taken from the end of the
            shuffled data).

    Returns:
        ``(trX, teX, trY, teY)``: lists of review texts and of boolean
        labels (``True`` when the review has more than 3 stars).
    """
    text = []
    stars = []
    # The context manager closes the file even when a line fails to parse.
    with open(review_json_path) as f:
        for row in f:
            if len(text) >= ntrain + ntest:
                break
            data = json.loads(row)
            if data['stars'] != 3:
                stars.append(data['stars'] > 3)
                text.append(data['text'])
    # Unseeded shuffle, as in the original: the split differs between runs.
    idxs = np.random.permutation(np.arange(len(text)))
    text = [text[idx] for idx in idxs]
    stars = [stars[idx] for idx in idxs]
    # Split by an explicit index: text[:-ntest] would be empty (and
    # text[-ntest:] the whole list) when ntest == 0.
    split = max(len(text) - ntest, 0)
    return text[:split], text[split:], stars[:split], stars[split:]


if __name__ == "__main__":
    # scikit-learn is only needed when running as a script; importing it here
    # keeps load() importable without scikit-learn installed.
    from sklearn import metrics
    from sklearn.linear_model import LogisticRegression as LR
    from sklearn.feature_extraction.text import TfidfVectorizer

    review_json_path = '/home/alec/datasets/yelp/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json'
    trX, teX, trY, teY = load(review_json_path, ntrain=1000000, ntest=100000)

    # Word uni- and bigram TF-IDF features, fitted on the training texts only.
    t = time()
    vect = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.9)
    trX = vect.fit_transform(trX)
    teX = vect.transform(teX)
    print(trX.shape)
    print('time to vect %s' % (time() - t))

    t = time()
    model = LR(C=8.)
    model.fit(trX, trY)
    print('time to model %s' % (time() - t))

    # Report training accuracy first, then test accuracy.
    tr_pred = model.predict(trX)
    te_pred = model.predict(teX)
    print(metrics.accuracy_score(trY, tr_pred))
    print(metrics.accuracy_score(teY, te_pred))