In [1]:
import pandas as pd
import numpy as np
from sklearn import cross_validation
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
In [10]:
# Read the data in. header=None: no row of the file is used as column names.
df = pd.read_csv('data_cars.csv', header=None)
# Cast every column to the pandas 'category' dtype, one column at a time.
for col in df.columns:
    df[col] = df[col].astype('category')
df.head()
Out[10]:
In [14]:
# Map categories to integer codes.
def _category_codes(column):
    """Return ``{category label: integer code}`` for a categorical Series.

    A label's code is its position in ``column.cat.categories``.
    """
    return {label: code for code, label in enumerate(column.cat.categories)}


# Capture the label -> code lookups BEFORE the columns are overwritten with
# their codes below; after that step the original labels are gone from df.
map0, map1, map2, map3, map4, map5, map6 = [
    _category_codes(df[i]) for i in range(7)
]

# Replace every categorical column by its integer codes.
cat_cols = df.select_dtypes(['category']).columns
df[cat_cols] = df[cat_cols].apply(lambda x: x.cat.codes)

# Shuffle the rows. A private, seeded RandomState keeps the row order (and so
# everything computed from it) reproducible across kernel restarts without
# touching numpy's global random state.
shuffle_rng = np.random.RandomState(0)
df = df.iloc[shuffle_rng.permutation(len(df))]
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(df.head())
In [40]:
# One empty results table per metric. Columns are 'method' followed by the
# keys of map6 ordered by their mapped integer value.
df_f1, df_precision, df_recall = (
    pd.DataFrame(columns=['method'] + sorted(map6, key=map6.get))
    for _ in range(3)
)
def CalcMeasures(method, y_pred, y_true, df_f1=df_f1,
                 df_precision=df_precision, df_recall=df_recall):
    """Append one row of per-class F1, precision and recall scores.

    Parameters
    ----------
    method : value stored in the first cell of each new row.
    y_pred : predicted class for every sample.
    y_true : reference class for every sample, aligned with ``y_pred``.
    df_f1, df_precision, df_recall : tables that receive the new row; they
        are modified in place. The defaults are bound once, when this
        function is defined, to the module-level tables of the same names.

    Returns
    -------
    None
    """
    # scikit-learn's metric functions take (y_true, y_pred) in that order.
    # Passing them reversed leaves per-class F1 unchanged but swaps precision
    # with recall, so the order matters here.
    # average=None asks for one score per class rather than a single average.
    df_f1.loc[len(df_f1)] = (
        [method] + list(f1_score(y_true, y_pred, average=None)))
    df_precision.loc[len(df_precision)] = (
        [method] + list(precision_score(y_true, y_pred, average=None)))
    df_recall.loc[len(df_recall)] = (
        [method] + list(recall_score(y_true, y_pred, average=None)))
# Feature matrix: every column except the last one.
X = df.iloc[:, :-1].values
# Target vector: the last column.
Y = df.iloc[:, -1].values
In [41]:
cv = 10  # passed as the `cv` argument of every cross_val_predict call below

# (display name, unfitted estimator) pairs, evaluated in this order.
classifiers = [
    ('linear support vector machine', svm.SVC(kernel='linear', C=50)),
    ('rbf support vector machine', svm.SVC(kernel='rbf', C=50)),
    ('poly support vector machine', svm.SVC(kernel='poly', C=50)),
    ('decision tree', DecisionTreeClassifier(random_state=0)),
    ('random forest',
     RandomForestClassifier(n_estimators=50, random_state=0,
                            max_features=None)),
    ('naive bayes', MultinomialNB()),
    ('logistic regression', LogisticRegression()),
    ('k nearest neighbours',
     KNeighborsClassifier(weights='distance', n_neighbors=5)),
]

# Get cross-validated predictions from each estimator and record its
# per-class scores. Each pass appends a row to the results tables, so running
# this cell twice duplicates the rows.
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# on newer versions cross_val_predict lives in sklearn.model_selection.
for method, clf in classifiers:
    y_pred = cross_validation.cross_val_predict(clf, X, Y, cv=cv)
    CalcMeasures(method, y_pred, Y)
In [45]:
# Display the accumulated df_f1 table (one row per CalcMeasures call).
df_f1
Out[45]:
In [46]:
# Display the accumulated df_precision table (one row per CalcMeasures call).
df_precision
Out[46]:
In [47]:
# Display the accumulated df_recall table (one row per CalcMeasures call).
df_recall
Out[47]:
In [42]:
# Number of rows for each distinct value in column 6.
labels_counts = df[6].value_counts()
# Series indexed by the keys of map6, holding their mapped values ...
code_by_label = pd.Series(map6)
# ... each value then looked up in labels_counts, giving a count per key.
code_by_label.map(labels_counts)
Out[42]:
In [ ]: