Ensembleutility
from __future__ import division
import numpy as np
from functools import partial
from itertools import product
from collections import Counter,defaultdict
from sklearn import svm
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn import neighbors
from operator import itemgetter
from ipdb import set_trace as l  # optional interactive debugging hook

def pca(train, test):
    # Fit PCA on the training data, letting the 'mle' heuristic choose the
    # number of components, then project both train and test into that space.
    trans = PCA('mle').fit(train)
    train = trans.transform(train.astype(np.float))
    test = trans.transform(test.astype(np.float))
    return train, test

def knn(train, target, test):
    # k-nearest-neighbours baseline with k = 9.
    clf = neighbors.KNeighborsClassifier(n_neighbors=9).fit(train, target)
    return clf.predict(test)

def linearsvm(train, target, test):
    clf = svm.SVC(kernel='linear').fit(train, target)
    return clf.predict(test)

def nonlinearsvm(train, target, test):
    clf = svm.NuSVC().fit(train, target)
    return clf.predict(test)

def genericrbfsvm(train, target, test, x, y):
    # RBF-kernel SVM with penalty C = x and kernel width gamma = y.
    clf = svm.SVC(kernel='rbf', C=x, gamma=y).fit(train, target)
    return clf.predict(test)

def rbfsvm(train, target, test):
    # Grid-search C over powers of ten and gamma over a linear range, score
    # each (C, gamma) pair by cross validation, then predict with the
    # best-scoring combination.
    allclfs = [partial(genericrbfsvm, x=i[0], y=i[1])
               for i in product([10 ** j for j in np.arange(-4, 5)],
                                np.linspace(0.01, 5, 10))]
    z = score(train, target, clfs=allclfs, best=True)
    maxcorrect = allclfs[max(z.iteritems(), key=itemgetter(1))[0]]
    x, y = maxcorrect.keywords['x'], maxcorrect.keywords['y']
    return genericrbfsvm(train, target, test, x=x, y=y)

def genericlogreg(train, target, test, x):
    # Logistic regression with inverse regularisation strength C = x.
    clf = linear_model.LogisticRegression(C=x).fit(train, target)
    return clf.predict(test)

def logreg(train, target, test):
    # Choose C by cross-validated score, then predict with the winner.
    allclfs = [partial(genericlogreg, x=i) for i in [10 ** j for j in np.arange(-4, 5)]]
    z = score(train, target, clfs=allclfs, best=True)
    maxcorrect = allclfs[max(z.iteritems(), key=itemgetter(1))[0]]
    return genericlogreg(train, target, test, x=maxcorrect.keywords['x'])

def genericrandomforest(train, target, test, x):
    # Random forest with x trees.
    clf = RandomForestClassifier(n_estimators=x).fit(train, target)
    return clf.predict(test)

def randomforest(train, target, test):
    # Choose the number of trees by cross-validated score, then predict with
    # the winning setting.
    allclfs = [partial(genericrandomforest, x=i) for i in range(10, 221, 30)]
    z = score(train, target, clfs=allclfs, best=True)
    maxcorrect = allclfs[max(z.iteritems(), key=itemgetter(1))[0]]
    return genericrandomforest(train, target, test, x=maxcorrect.keywords['x'])

# Models that take part in the ensemble vote.
# allclfs = [logreg, randomforest, rbfsvm, linearsvm, nonlinearsvm]
allclfs = [logreg, randomforest, rbfsvm]

def normalize(x):
    # Scale each feature column by its maximum value.
    x = x.astype(np.float)
    x /= np.max(x, axis=0)
    return x

def average(train, target, test, all=False, runpca=False, norm=True, clfs=allclfs):
    '''
    Run every model in "clfs" (optionally also on PCA-reduced and
    max-normalised copies of the data) and label each test sample with the
    most common prediction across models.
    '''
    allvalues = [i(train, target, test) for i in clfs]
    if runpca:
        pcatrain, pcatest = pca(train, test)
        allvalues += [i(pcatrain, target, pcatest) for i in clfs]
    if norm:
        normtrain, normtest = map(normalize, (train, test))
        allvalues += [i(normtrain, target, normtest) for i in clfs]
    # Transpose so each row holds every model's prediction for one test sample.
    ivar = [[i[j] for i in allvalues] for j in range(len(allvalues[0]))]
    ivar = [[int(i) for i in j] for j in ivar]
    # Majority vote per sample, appended as an extra column.
    final = [Counter(i).most_common(1)[0][0] for i in ivar]
    ivar = [k + [final[j]] for j, k in enumerate(ivar)]
    if all:
        return ivar
    return final

def shuffle(x, y):
    # Apply the same random permutation to the data and its labels.
    randomindex = np.random.permutation(len(x))
    return x[randomindex], y[randomindex]

def splitwithfactor(train, target, factor=0.1):
    # Hold out the last "factor" fraction of the data as a validation set.
    newlen = int(len(train) * (1 - factor))
    return train[:newlen], target[:newlen], train[newlen:], target[newlen:]

def score(train, target, clfs=allclfs, best=False):
    '''
    Cross-validate on a shuffled 80/20 split and return, per model index,
    the fraction of held-out samples that model labelled correctly.
    '''
    train, target = shuffle(train, target)
    result = defaultdict(int)
    train, target, test, testsln = splitwithfactor(train, target, 0.2)
    if not best:
        predictions = average(train, target, test, all=True, clfs=clfs)
    else:
        # When selecting the best single model, skip the PCA/normalised
        # variants and drop the appended majority-vote column so the keys
        # of "result" map directly onto indices in "clfs".
        predictions = average(train, target, test, all=True, clfs=clfs,
                              norm=False, runpca=False)
        predictions = [row[:len(clfs)] for row in predictions]
    for i in range(len(predictions)):
        for k, j in enumerate(predictions[i]):
            if j == int(testsln[i]):
                result[k] += (1 / len(test))
    return result
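
# A minimal usage sketch, not part of the original gist: it assumes the
# features and labels are numpy arrays and builds a synthetic two-blob
# binary classification problem purely to show how average() and score()
# are meant to be called.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    # Two Gaussian blobs, 4 features each, as a toy dataset.
    train = np.vstack([rng.randn(50, 4), rng.randn(50, 4) + 2])
    target = np.array([0] * 50 + [1] * 50)
    test = np.vstack([rng.randn(10, 4), rng.randn(10, 4) + 2])
    # Majority-vote labels for the test samples.
    print(average(train, target, test))
    # Per-model (and ensemble) accuracy on a held-out 20% split.
    print(score(train, target))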