Created
November 20, 2012 05:16
-
-
Save rishabhjain/4116185 to your computer and use it in GitHub Desktop.
Ensemble utility
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from __future__ import division | |
| import numpy as np | |
| from functools import partial | |
| from itertools import product | |
| from collections import Counter,defaultdict | |
| from sklearn import svm | |
| from sklearn import linear_model | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.decomposition import PCA | |
| from sklearn import neighbors | |
| from operator import itemgetter | |
| from ipdb import set_trace as l | |
def pca(train, test):
    """Fit PCA on train and project both train and test into the PCA space.

    n_components='mle' lets Minka's MLE choose the number of components.
    Returns the transformed (train, test) pair.
    """
    trans = PCA(n_components='mle').fit(train)
    # np.float was removed in NumPy 1.24; the builtin float is the same dtype.
    train = trans.transform(train.astype(float))
    test = trans.transform(test.astype(float))
    return train, test
def knn(train, target, test):
    """Predict labels for test with a 9-nearest-neighbours classifier."""
    model = neighbors.KNeighborsClassifier(n_neighbors=9)
    model.fit(train, target)
    return model.predict(test)
def linearsvm(train, target, test):
    """Predict labels for test with a linear-kernel support vector classifier."""
    model = svm.SVC(kernel='linear')
    return model.fit(train, target).predict(test)
def nonlinearsvm(train, target, test):
    """Predict labels for test with a Nu-SVC (default RBF kernel)."""
    model = svm.NuSVC()
    return model.fit(train, target).predict(test)
def genericrbfsvm(train, target, test, x, y):
    """RBF-kernel SVC parameterised by C=x and gamma=y; predicts test labels."""
    model = svm.SVC(kernel='rbf', C=x, gamma=y)
    return model.fit(train, target).predict(test)
def rbfsvm(train, target, test):
    """Grid-search an RBF SVM over (C, gamma) via score(), then predict test.

    C ranges over 10**-4 .. 10**4 and gamma over 10 points in [0.01, 5];
    the pair with the best hold-out accuracy is refit on all of train.
    """
    grid = product([10 ** i for i in np.arange(-4, 5)],
                   np.linspace(0.01, 5, 10))
    allclfs = [partial(genericrbfsvm, x=c, y=g) for c, g in grid]
    z = score(train, target, clfs=allclfs, best=True)
    # z maps classifier index -> accuracy; dict.iteritems() is Python-2-only,
    # items() behaves the same on both Python 2 and 3.
    maxcorrect = allclfs[max(z.items(), key=itemgetter(1))[0]]
    x, y = maxcorrect.keywords['x'], maxcorrect.keywords['y']
    return genericrbfsvm(train, target, test, x=x, y=y)
def genericlogreg(train, target, test, x):
    """Logistic regression with inverse regularisation strength C=x."""
    model = linear_model.LogisticRegression(C=x)
    return model.fit(train, target).predict(test)
def logreg(train, target, test):
    """Grid-search logistic regression over C in 10**-4 .. 10**4 via score().

    Refits the best-scoring C on all of train and predicts test.
    """
    allclfs = [partial(genericlogreg, x=c) for c in (10 ** j for j in np.arange(-4, 5))]
    z = score(train, target, clfs=allclfs, best=True)
    # dict.iteritems() is Python-2-only; items() works on both 2 and 3.
    maxcorrect = allclfs[max(z.items(), key=itemgetter(1))[0]]
    return genericlogreg(train, target, test, x=maxcorrect.keywords['x'])
def genericrandomforest(train, target, test, x):
    """Random forest with x trees; returns predictions for test."""
    model = RandomForestClassifier(n_estimators=x)
    return model.fit(train, target).predict(test)
def randomforest(train, target, test):
    """Grid-search a random forest over n_estimators in 10..220 (step 30).

    Picks the tree count with the best score() accuracy, refits on all of
    train, and predicts test.
    """
    allclfs = [partial(genericrandomforest, x=n) for n in range(10, 221, 30)]
    z = score(train, target, clfs=allclfs, best=True)
    # dict.iteritems() is Python-2-only; items() works on both 2 and 3.
    maxcorrect = allclfs[max(z.items(), key=itemgetter(1))[0]]
    return genericrandomforest(train, target, test, x=maxcorrect.keywords['x'])
# allclfs = [logreg,randomforest,rbfsvm,linearsvm,nonlinearsvm]
# Default model pool used by average() and score(); the linear/Nu SVM
# variants above are disabled (presumably too slow or weak here -- TODO confirm).
allclfs = [logreg,randomforest,rbfsvm]
def normalize(x):
    """Scale each column of x by its column maximum (in-place on a float copy).

    Returns a float array whose per-column maxima are 1. Note: a column whose
    maximum is 0 will produce NaN/inf, matching the original behaviour.
    """
    # np.float was removed in NumPy 1.24; builtin float is equivalent here.
    x = x.astype(float)
    x /= np.max(x, axis=0)
    return x
def average(train,target,test,all=False,runpca=False,norm=True,clfs=allclfs):
    '''
    Takes output labels of all models present in "clfs" list and gives
    most common labels to testdata.
    '''
    predictions = [clf(train, target, test) for clf in clfs]
    if runpca:
        # Re-run every model on PCA-projected features as extra voters.
        ptrain, ptest = pca(train, test)
        predictions.extend(clf(ptrain, target, ptest) for clf in clfs)
    if norm:
        # Re-run every model on column-normalized features as extra voters.
        ntrain, ntest = normalize(train), normalize(test)
        predictions.extend(clf(ntrain, target, ntest) for clf in clfs)
    # Transpose to one row per test sample (one int label per model).
    per_sample = [[int(label) for label in votes] for votes in zip(*predictions)]
    final = [Counter(votes).most_common(1)[0][0] for votes in per_sample]
    if all:
        # Return each sample's individual votes with the majority appended.
        return [votes + [final[i]] for i, votes in enumerate(per_sample)]
    return final
def shuffle(x, y):
    """Apply one shared random permutation to x and y, keeping rows aligned."""
    order = np.random.permutation(len(x))
    return x[order], y[order]
def splitwithfactor(train, target, factor=0.1):
    """Split off the trailing `factor` fraction of the data as a hold-out set.

    Returns (train_head, target_head, train_tail, target_tail).
    """
    cut = int((1 - factor) * len(train))
    head = (train[:cut], target[:cut])
    tail = (train[cut:], target[cut:])
    return head[0], head[1], tail[0], tail[1]
def score(train,target,clfs=allclfs,best = False):
    '''
    Cross Validation and returns accuracy.
    '''
    # Shuffle, then hold out the last 20% as a validation set.
    train, target = shuffle(train, target)
    train, target, test, testsln = splitwithfactor(train, target, 0.2)
    if best:
        # best=True: score only the raw-feature runs (no PCA / normalization).
        predictions = average(train, target, test, all=True, clfs=clfs,
                              norm=False, runpca=False)
    else:
        predictions = average(train, target, test, all=True, clfs=clfs)
    # Accumulate per-column accuracy; the last column is the majority vote.
    result = defaultdict(int)
    fraction = 1 / len(test)
    for row, truth in zip(predictions, testsln):
        for col, label in enumerate(row):
            if label == int(truth):
                result[col] += fraction
    return result
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment