Skip to content

Instantly share code, notes, and snippets.

@Newmu
Last active August 29, 2015 14:19
Show Gist options
  • Select an option

  • Save Newmu/fd18ffd002af49b9b208 to your computer and use it in GitHub Desktop.

Select an option

Save Newmu/fd18ffd002af49b9b208 to your computer and use it in GitHub Desktop.

Revisions

  1. Newmu revised this gist Apr 14, 2015. 1 changed file with 31 additions and 18 deletions.
    49 changes: 31 additions & 18 deletions yelp_lr_tufs.py
    Original file line number Diff line number Diff line change
    @@ -1,46 +1,59 @@
    import json
    import numpy as np
    from time import time
    from matplotlib import pyplot as plt
    import random

    from sklearn import metrics
    from sklearn.linear_model import LogisticRegression as LR
    from sklearn.feature_extraction.text import TfidfVectorizer

    def len_filter(text, max_len=1014):
        """Truncate *text* at a word boundary so it fits a character budget.

        The text is split on single spaces and the running length of the
        re-joined string is tracked word by word. A word is kept while that
        running length stays strictly below ``max_len``.

        Args:
            text: The string to truncate.
            max_len: Character budget. This argument used to be ignored (the
                comparison was hard-coded to 1014); it is now honoured. The
                default keeps the previous behaviour.

        Returns:
            The kept words joined back together with single spaces.
        """
        words = text.split(' ')
        # Every word costs its own length plus one separating space ...
        lens = [len(word) + 1 for word in words]
        # ... except the first word, which has no space in front of it.
        lens[0] -= 1
        # Kept from the original: the last word is discounted by one as well,
        # which lets a text of exactly max_len characters pass through whole.
        lens[-1] -= 1
        ends = np.cumsum(lens).tolist()
        kept = [word for word, end in zip(words, ends) if end < max_len]
        return ' '.join(kept)

    # NOTE(review): this span is a rendered revision diff whose +/- markers were
    # stripped, so lines from the old and the new version of `load` appear
    # interleaved (see the two consecutive loop guards and the duplicated
    # teY/trY assignments below). It is not runnable as written; which lines
    # belong to which version must be confirmed against the actual file.
    def load(review_json_path, ntrain=10000, ntest=10000):
    # Seeds Python's `random` module, which the sample/shuffle calls below use.
    random.seed(42)
    f = open(review_json_path)
    text = []
    stars = []
    funny = []
    useful = []
    cool = []
    pos_text = []
    neg_text = []
    n = 0
    for i, row in enumerate(f):
    if n >= ntest+ntrain:
    # Stop once twice the size of the smaller class reaches ntest+ntrain.
    if (min([len(pos_text), len(neg_text)])*2) >= ntest+ntrain:
    break
    data = json.loads(row)
    if data['stars'] != 3:
    stars.append(data['stars'] > 3)
    text.append(data['text'])
    funny.append(data['votes']['funny'])
    useful.append(data['votes']['useful'])
    cool.append(data['votes']['cool'])
    n += 1

    idxs = np.random.permutation(np.arange(len(text)))
    # Only reviews whose text has at least 100 characters are considered.
    if len(data['text']) >= 100:
    # NOTE(review): `text` is rebound to a string here although it is also
    # initialised as a list above - presumably the list belongs to the old
    # version; confirm.
    text = len_filter(data['text'])
    if data['stars'] > 3:
    pos_text.append(text)
    else:
    neg_text.append(text)
    if i % 10000 == 0: print i, min([len(pos_text), len(neg_text)])*2

    # Draws (ntrain+ntest)/2 reviews from each class; labels are 1. for the
    # pos_text half and 0. for the neg_text half.
    # NOTE(review): `/` is used for the counts, which only yields an int under
    # Python 2 division semantics.
    text = random.sample(pos_text, (ntrain+ntest)/2) + random.sample(neg_text, (ntrain+ntest)/2)
    labels = ([1.] * ((ntrain+ntest)/2)) + ([0.] * ((ntrain+ntest)/2))
    idxs = np.arange(len(text))
    random.shuffle(idxs)
    text = [text[idx] for idx in idxs]
    stars = [stars[idx] for idx in idxs]
    labels = [labels[idx] for idx in idxs]

    # The last ntest shuffled items form the test split, the rest the train split.
    teX = text[-ntest:]
    trX = text[:-ntest]

    teY = stars[-ntest:]
    trY = stars[:-ntest]
    teY = labels[-ntest:]
    trY = labels[:-ntest]

    return trX, teX, trY, teY

    if __name__ == "__main__":
    review_json_path = '/home/alec/datasets/yelp/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json'
    trX, teX, trY, teY = load(review_json_path, ntrain=1000000, ntest=100000)
    trX, teX, trY, teY = load(review_json_path, ntrain=500000, ntest=50000)

    t = time()
    vect = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.9)
  2. Newmu created this gist Apr 13, 2015.
    61 changes: 61 additions & 0 deletions yelp_lr_tufs.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,61 @@
    import json
    import numpy as np
    from time import time

    from sklearn import metrics
    from sklearn.linear_model import LogisticRegression as LR
    from sklearn.feature_extraction.text import TfidfVectorizer

    def load(review_json_path, ntrain=10000, ntest=10000):
        """Load polar reviews from a JSON-lines file and split train/test.

        The file is read line by line, each line parsed as one JSON object.
        Reviews whose ``stars`` value is not 3 are kept until ``ntrain + ntest``
        of them have been collected. The kept reviews are shuffled with NumPy's
        global RNG and the last ``ntest`` of them become the test split.

        Args:
            review_json_path: Path to a file holding one JSON review per line;
                every review must provide ``stars`` and ``text``.
            ntrain: Number of training examples wanted.
            ntest: Number of test examples wanted.

        Returns:
            ``(trX, teX, trY, teY)``: train/test review texts and their boolean
            labels (``True`` when ``stars > 3``). If the file holds fewer than
            ``ntrain + ntest`` usable reviews, the train split is the one that
            comes up short.
        """
        wanted = ntrain + ntest
        text = []
        stars = []
        # The context manager closes the file even when a row fails to parse;
        # the previous bare open() never closed it.
        with open(review_json_path) as f:
            for row in f:
                if len(text) >= wanted:
                    break
                data = json.loads(row)
                if data['stars'] != 3:
                    stars.append(data['stars'] > 3)
                    text.append(data['text'])

        idxs = np.random.permutation(np.arange(len(text)))
        text = [text[idx] for idx in idxs]
        stars = [stars[idx] for idx in idxs]

        # Split on an explicit index: slicing with -ntest returns the whole
        # list as "test" (and an empty train split) when ntest == 0.
        split = max(len(text) - ntest, 0)

        teX = text[split:]
        trX = text[:split]

        teY = stars[split:]
        trY = stars[:split]

        return trX, teX, trY, teY

    if __name__ == "__main__":
        # Script entry point: load the reviews, build TF-IDF features, fit a
        # logistic-regression classifier and report train/test accuracy.
        review_json_path = '/home/alec/datasets/yelp/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json'
        trX, teX, trY, teY = load(review_json_path, ntrain=1000000, ntest=100000)

        # Unigram + bigram TF-IDF features; the vocabulary is fitted on the
        # training texts only and then reused to transform the test texts.
        t = time()
        vect = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.9)
        trX = vect.fit_transform(trX)
        teX = vect.transform(teX)
        # Single-argument print(...) calls emit the same text under the
        # Python 2 print statement and the Python 3 print function.
        print(trX.shape)
        print('time to vect %s' % (time() - t))

        t = time()
        model = LR(C=8.)
        model.fit(trX, trY)
        print('time to model %s' % (time() - t))

        tr_pred = model.predict(trX)
        te_pred = model.predict(teX)

        # Train accuracy first, then test accuracy.
        print(metrics.accuracy_score(trY, tr_pred))
        print(metrics.accuracy_score(teY, te_pred))