"""Benchmark script: c_v topic coherence on the Movies dataset using gensim.

Steps performed at import/run time:
  1. Tokenize every line of every file under ``prefix`` into a document.
  2. Build a gensim ``Dictionary`` and bag-of-words corpus from those documents.
  3. Load the topic list and the human-assigned scores from text files.
  4. Compute the ``c_v`` coherence for the first topic and print it.

Timing for the expensive stages is printed as ``Time taken: ...``.
"""
import os
import re
from datetime import datetime

from gensim.corpora.dictionary import Dictionary
from gensim.models import CoherenceModel
from scipy.stats import pearsonr

prefix = "/home/devashish/datasets/Movies/movie"

# ---------------------------------------------------------------------------
# Stage 1: read and tokenize the corpus (one document per input line).
# ---------------------------------------------------------------------------
start = datetime.now()
texts = []
for fil in os.listdir(prefix):
    # `with` guarantees the handle is closed even if tokenizing fails.
    with open(prefix + '/' + fil) as doc_file:
        for line in doc_file:
            # Lower-case all words.
            lowered = line.lower()
            # Remove punctuation and split into separate words.
            # Only re.UNICODE is passed: combining it with re.LOCALE is
            # rejected by Python 3's `re` for str patterns.
            words = re.findall(r'\w+', lowered, flags=re.UNICODE)
            texts.append(words)
end = datetime.now()
print("Time taken: %s" % (end - start))

# ---------------------------------------------------------------------------
# Stage 2: build the dictionary and the bag-of-words corpus.
# ---------------------------------------------------------------------------
start = datetime.now()
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
end = datetime.now()
print("Time taken: %s" % (end - start))

print(len(corpus))
print(dictionary)

# ---------------------------------------------------------------------------
# Stage 3: load topics and human scores.
# ---------------------------------------------------------------------------
# List of 100 topics. Each entry is wrapped in its own one-element list so it
# can be handed directly to CoherenceModel(topics=...), which takes a list of
# topics.
with open('/home/devashish/datasets/Movies/topicsMovie.txt') as topics_file:
    topics = [[l.split()] for l in topics_file]
# Drop the entry at index 100 (the 101st line of the file).
# NOTE(review): presumably a trailing non-topic line -- confirm against the
# data file; this raises IndexError if the file has fewer than 101 lines.
topics.pop(100)

# One float score per line. Not used further in the visible part of the
# script.
with open('/home/devashish/datasets/Movies/goldMovie.txt') as gold_file:
    human_scores = [float(l.strip()) for l in gold_file]

# ---------------------------------------------------------------------------
# Stage 4: compute c_v coherence.
# ---------------------------------------------------------------------------
start = datetime.now()
c_v = []
# Only the first topic is evaluated (topics[:1]).
for n, topic in enumerate(topics[:1]):
    print(n)  # progress indicator
    # Any exception (including KeyError) propagates to the caller; the
    # original `except KeyError: raise` wrapper had no effect and was removed.
    cm = CoherenceModel(topics=topic, texts=texts, dictionary=dictionary,
                        coherence='c_v')
    c_v.append(cm.get_coherence())
end = datetime.now()
print("Time taken: %s" % (end - start))

print(c_v)