"""Stacked ensemble for a binary web-page classification task (StumbleUpon-style).

Reads train.tsv / test.tsv, builds several TF-IDF feature views of the
boilerplate text (word n-grams, char trigrams, URL host, first-10 and
first-15 stemmed tokens), generates out-of-fold predictions from several
base models, and blends them with a Ridge regression.  Writes Solution.csv
with columns (urlid, label).
"""

import csv
import re

import numpy as np
import scipy as scipy

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import LancasterStemmer, SnowballStemmer
from nltk.stem.snowball import EnglishStemmer
from nltk import word_tokenize
from sklearn import preprocessing, metrics, cross_validation
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_selection import SelectPercentile, SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Ridge


class SnowballTokenizer(object):
    """Callable tokenizer: NLTK word tokenization + Snowball English stemming."""

    def __init__(self):
        self.sstem = EnglishStemmer(ignore_stopwords=False)

    def __call__(self, doc):
        return [self.sstem.stem(t) for t in word_tokenize(doc)]


class LTokenizer10(object):
    """Callable tokenizer: Lancaster-stem only the first 10 tokens of a document."""

    def __init__(self):
        self.lstem = LancasterStemmer()

    def __call__(self, doc):
        tokens = word_tokenize(doc)
        # Keep at most the first 10 tokens.
        return [self.lstem.stem(t) for t in tokens[:10]]


class LTokenizer15(object):
    """Callable tokenizer: Lancaster-stem only the first 15 tokens of a document."""

    def __init__(self):
        self.lstem = LancasterStemmer()

    def __call__(self, doc):
        tokens = word_tokenize(doc)
        # Keep at most the first 15 tokens.
        return [self.lstem.stem(t) for t in tokens[:15]]


def _extract_domain(url):
    """Return the host part of an ``http://host/...`` URL, or '' if no match."""
    m = re.search(r'http://(.+?)/', url)
    return m.group(1) if m else ''


def main():
    """Train the base models, blend their predictions, and write Solution.csv."""
    trainlabels, testlabels, words, chars, URL, first10, first15 = GetLabelsAndText()

    n_train = len(trainlabels)
    # Feature matrices are stacked train-then-test; split them back apart.
    trainwords, testwords = words[:n_train], words[n_train:]
    trainchars, testchars = chars[:n_train], chars[n_train:]
    trainURL, testURLfeat = URL[:n_train], URL[n_train:]
    trainfirst10, testfirst10 = first10[:n_train], first10[n_train:]
    trainfirst15, testfirst15 = first15[:n_train], first15[n_train:]

    # 5-fold stratified CV; every base model uses the same fold assignment so
    # the out-of-fold predictions line up row-for-row for the level-2 model.
    kf = StratifiedKFold(trainlabels, n_folds=5, indices=True)

    cvLogWords, testLogWords = ModelLogistic(2.4, kf, trainwords, trainlabels, testwords)
    cvMultWords, testMultWords = ModelMultinomial(10, 0.025, kf, trainwords, trainlabels, testwords)
    cvLogChars, testLogChars = ModelLogistic(1.45, kf, trainchars, trainlabels, testchars)
    cvRFChars, testRFChars = ModelRandomForest(200, kf, trainchars, trainlabels, testchars)
    cvURL, testURLpred = ModelLogistic(2.0, kf, trainURL, trainlabels, testURLfeat)
    cv10, test10 = ModelLogistic(0.8, kf, trainfirst10, trainlabels, testfirst10)
    cv15, test15 = ModelLogistic(0.8, kf, trainfirst15, trainlabels, testfirst15)
    # Labels re-ordered to match the concatenated out-of-fold prediction order.
    cvlabels = ModelNone(kf, trainlabels)

    # Level-2 design matrices: one column per base model.
    X = np.vstack((cvLogWords, cvMultWords, cvLogChars, cvRFChars,
                   cvURL, cv10, cv15)).T
    y = cvlabels
    Xtest = np.vstack((testLogWords, testMultWords, testLogChars, testRFChars,
                       testURLpred, test10, test15)).T

    model = Ridge(alpha=500)
    model.fit(X, y)
    outputs = model.predict(Xtest)

    # BUG FIX: the original called bare `vstack` (a NameError); use np.vstack.
    final = np.vstack((testlabels.astype(int), outputs.astype(float))).T

    # Write the submission; `with` guarantees the handle is closed (the
    # original leaked it).
    with open('Solution.csv', "wb") as fout:
        writer = csv.writer(fout)
        writer.writerow(['urlid', 'label'])
        for row in final:
            writer.writerow(row)


def GetLabelsAndText():
    """Load train.tsv/test.tsv and build all TF-IDF feature matrices.

    Returns
    -------
    (trainlabels, testlabels, Words, Chars, URL, First10, First15)
        Label arrays plus five sparse matrices, each stacked train-then-test
        in row order.
    """
    text = []       # boilerplate text (column 2), train rows then test rows
    URL = []        # extracted host names, same order as `text`
    trainlabels = []

    with open('train.tsv', 'rb') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader)  # skip header
        for row in reader:
            text.append(row[2])
            trainlabels.append(row[26])  # target label column
            URL.append(_extract_domain(row[0]))
    trainlabels = np.array(trainlabels).astype(int)

    testlabels = []
    with open('test.tsv', 'rb') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader)  # skip header
        for row in reader:
            text.append(row[2])
            testlabels.append(row[1])  # urlid column, echoed into the submission
            URL.append(_extract_domain(row[0]))
    testlabels = np.array(testlabels)

    # Character features: lowercase and collapse '.', ',' and runs of spaces.
    # (Plain lists avoid the fixed-width numpy string truncation hazard of the
    # original in-place array assignment.)
    chartext = [re.sub(r'\.|\,| +', ' ', t.lower()) for t in text]
    vect = TfidfVectorizer(norm='l2', min_df=3, max_df=1.0,
                           strip_accents='unicode', analyzer='char',
                           ngram_range=(3, 3), use_idf=1, smooth_idf=1,
                           sublinear_tf=1)
    Chars = vect.fit_transform(chartext)

    # Word features: additionally strip the raw-JSON scaffolding markers
    # ("title"/"body"/"url" keys) left in the boilerplate column.
    wordtext = [re.sub(r'","url":"|{"title":"|","body":"|"}|\.|\,| +',
                       ' ', t.lower()) for t in text]
    vect = TfidfVectorizer(norm='l2', stop_words='english', min_df=3,
                           max_df=1.0, strip_accents='unicode',
                           analyzer='word', ngram_range=(1, 2), use_idf=1,
                           smooth_idf=1, sublinear_tf=1,
                           tokenizer=SnowballTokenizer())
    Words = vect.fit_transform(wordtext)

    # URL host features: default word analyzer over the extracted domains.
    vect = TfidfVectorizer(strip_accents=None, tokenizer=None, analyzer='word')
    URL = vect.fit_transform(URL)

    # Unigram features over only the first 10 / 15 (Lancaster-stemmed) tokens.
    vect = TfidfVectorizer(norm='l2', min_df=3, max_df=1.0, analyzer='word',
                           ngram_range=(1, 1), use_idf=1, smooth_idf=1,
                           sublinear_tf=1, tokenizer=LTokenizer10())
    First10 = vect.fit_transform(wordtext)

    vect = TfidfVectorizer(norm='l2', min_df=3, max_df=1.0, analyzer='word',
                           ngram_range=(1, 1), use_idf=1, smooth_idf=1,
                           sublinear_tf=1, tokenizer=LTokenizer15())
    First15 = vect.fit_transform(wordtext)

    return trainlabels, testlabels, Words, Chars, URL, First10, First15


def ModelNone(kf, y):
    """Return `y` re-ordered to match the concatenated CV-fold prediction order."""
    # Robust to unequal fold sizes, unlike the original fixed (5, n/5) buffer.
    return np.hstack([y[cv] for train, cv in kf])


def ModelLogistic(cValue, kf, X, y, Xtest):
    """Logistic regression base model.

    Parameters
    ----------
    cValue : float   inverse regularization strength C.
    kf     : iterable of (train_idx, cv_idx) folds.
    X, y   : training features / labels.
    Xtest  : test features.

    Returns (out_of_fold_probs, test_probs) for the positive class.
    """
    fold_preds = []
    for train, cv in kf:
        model = LogisticRegression(C=cValue)
        model.fit(X[train], y[train])
        fold_preds.append(model.predict_proba(X[cv])[:, 1])
    # Refit on all training data for the test-set predictions.
    model = LogisticRegression(C=cValue)
    model.fit(X, y)
    outputsTest = model.predict_proba(Xtest)[:, 1]
    return np.hstack(fold_preds), outputsTest


def ModelMultinomial(kValue, alpha, kf, X, y, Xtest):
    """Multinomial naive Bayes on the top `kValue` percent of chi2 features.

    Returns (out_of_fold_probs, test_probs) for the positive class.
    """
    FS = SelectPercentile(score_func=chi2, percentile=kValue)
    fold_preds = []
    for train, cv in kf:
        # Feature selection is refit inside each fold to avoid leakage.
        X_new = FS.fit_transform(X[train], y[train])
        model = MultinomialNB(alpha=alpha)
        model.fit(X_new, y[train])
        fold_preds.append(model.predict_proba(FS.transform(X[cv]))[:, 1])
    X_new = FS.fit_transform(X, y)
    model = MultinomialNB(alpha=alpha)
    model.fit(X_new, y)
    outputsTest = model.predict_proba(FS.transform(Xtest))[:, 1]
    return np.hstack(fold_preds), outputsTest


def ModelRandomForest(kChi, kf, X, y, Xtest):
    """1000-tree random forest on the top `kChi` chi2-selected features.

    The selected sparse features are densified (`todense`) because the
    forest implementation requires dense input.

    Returns (out_of_fold_probs, test_probs) for the positive class.
    """
    FS = SelectKBest(score_func=chi2, k=kChi)
    fold_preds = []
    for train, cv in kf:
        X_new = FS.fit_transform(X[train], y[train])
        model = RandomForestClassifier(n_estimators=1000)
        model.fit(X_new.todense(), y[train])
        fold_preds.append(
            model.predict_proba(FS.transform(X[cv].todense()))[:, 1])
    X_new = FS.fit_transform(X, y)
    model = RandomForestClassifier(n_estimators=1000)
    model.fit(X_new.todense(), y)
    outputsTest = model.predict_proba(FS.transform(Xtest.todense()))[:, 1]
    return np.hstack(fold_preds), outputsTest


if __name__ == '__main__':
    main()