In [1]:
import re

from sklearn.datasets import fetch_20newsgroups
from scipy.stats import pearsonr
from datetime import datetime

from gensim.models import CoherenceModel
from gensim.corpora.dictionary import Dictionary

In [2]:
dataset = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

In [3]:
documents = dataset['data']  # is a list of documents

In [4]:
texts = []
for document in documents:
    # lower case all words
    lowered = document.lower()
    # remove punctuation and split into seperate words
    words = re.findall(r'\w+', lowered, flags = re.UNICODE | re.LOCALE)
    texts.append(words)

In [5]:
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [6]:
print len(documents)
print dictionary

18846
Dictionary(134435 unique tokens: [u'3ds2scn', u'diagnositic', u'9l2t', u'l1tbk', u'porkification']...)


In [10]:
topics = []  # list of 100 topics
for l in open('/home/devashish/datasets/20NG/topics20NG.txt'):
    topics.append([l.split()])

In [8]:
human_scores = []
for l in open('/home/devashish/datasets/20NG/gold20NG.txt'):
    human_scores.append(float(l.strip()))

In [9]:
start = datetime.now()
u_mass = []
flags = []
for n, topic in enumerate(topics):
    try:
        cm = CoherenceModel(topics=topic, corpus=corpus, dictionary=dictionary, coherence='u_mass')
        u_mass.append(cm.get_coherence())
    except KeyError:
        flags.append(n)
end = datetime.now()
print "Time taken: %s" % (end - start)

Time taken: 0:04:10.235838


In [13]:
start = datetime.now()
c_v = []
for n, topic in enumerate(topics):
    try:
        cm = CoherenceModel(topics=topic, texts=texts, dictionary=dictionary, coherence='c_v')
        c_v.append(cm.get_coherence())
    except KeyError:
        pass
end = datetime.now()
print "Time taken: %s" % (end - start)

Time taken: 0:50:03.377507


In [14]:
final_scores = []
for n, score in enumerate(human_scores):
    if n not in flags:
        final_scores.append(score)

In [15]:
print len(u_mass), len(c_v), len(final_scores)
# 3 topics have words that are not in the dictionary.This is due to a difference
# in preprocessing or because of the absence of ~900 documents

93 93 93


In [16]:
print pearsonr(u_mass, final_scores)[0]
print pearsonr(c_v, final_scores)[0]

0.554915406168
0.616997328993
