In [1]:
import pandas as pd
import numpy as np
from sklearn import cross_validation
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [10]:
#read data in
df = pd.read_csv('data_cars.csv',header=None)
for i in range(len(df.columns)):
    df[i] = df[i].astype('category')
df.head()


Out[10]:
0 1 2 3 4 5 6
0 vhigh vhigh 2 2 small low unacc
1 vhigh vhigh 2 2 small med unacc
2 vhigh vhigh 2 2 small high unacc
3 vhigh vhigh 2 2 med low unacc
4 vhigh vhigh 2 2 med med unacc

In [14]:
#map catgories to values
map0 = dict( zip( df[0].cat.categories, range( len(df[0].cat.categories ))))
#print map0
map1 = dict( zip( df[1].cat.categories, range( len(df[1].cat.categories ))))
map2 = dict( zip( df[2].cat.categories, range( len(df[2].cat.categories ))))
map3 = dict( zip( df[3].cat.categories, range( len(df[3].cat.categories ))))
map4 = dict( zip( df[4].cat.categories, range( len(df[4].cat.categories ))))
map5 = dict( zip( df[5].cat.categories, range( len(df[5].cat.categories ))))
map6 = dict( zip( df[6].cat.categories, range( len(df[6].cat.categories ))))

cat_cols = df.select_dtypes(['category']).columns
df[cat_cols] = df[cat_cols].apply(lambda x: x.cat.codes)

df = df.iloc[np.random.permutation(len(df))]
print df.head()


      0  1  2  3  4  5  6
570   0  0  1  0  1  1  2
951   2  3  3  0  0  1  2
1633  1  1  0  1  1  2  0
412   3  1  3  0  0  2  2
156   3  0  1  2  1  1  2

In [40]:
df_f1 = pd.DataFrame(columns=['method']+sorted(map6, key=map6.get))
df_precision = pd.DataFrame(columns=['method']+sorted(map6, key=map6.get))
df_recall = pd.DataFrame(columns=['method']+sorted(map6, key=map6.get))
def CalcMeasures(method,y_pred,y_true,df_f1=df_f1
                 ,df_precision=df_precision,df_recall=df_recall):

    df_f1.loc[len(df_f1)] = [method]+list(f1_score(y_pred,y_true,average=None))
    df_precision.loc[len(df_precision)] = [method]+list(precision_score(y_pred,y_true,average=None))
    df_recall.loc[len(df_recall)] = [method]+list(recall_score(y_pred,y_true,average=None))
    
X= df[df.columns[:-1]].values
Y = df[df.columns[-1]].values

In [41]:
cv = 10
method = 'linear support vector machine'
clf = svm.SVC(kernel='linear',C=50)
y_pred = cross_validation.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'rbf support vector machine'
clf = svm.SVC(kernel='rbf',C=50)
y_pred = cross_validation.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'poly support vector machine'
clf = svm.SVC(kernel='poly',C=50)
y_pred = cross_validation.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'decision tree'
clf = DecisionTreeClassifier(random_state=0)
y_pred = cross_validation.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'random forest'
clf = RandomForestClassifier(n_estimators=50,random_state=0,max_features=None)
y_pred = cross_validation.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'naive bayes'
clf = MultinomialNB()
y_pred = cross_validation.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'logistic regression'
clf = LogisticRegression()
y_pred = cross_validation.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'k nearest neighbours'
clf = KNeighborsClassifier(weights='distance',n_neighbors=5)
y_pred = cross_validation.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

In [45]:
df_f1


Out[45]:
method acc good unacc vgood
0 linear support vector machine 0.271318 0.000000 0.846757 0.000000
1 rbf support vector machine 0.990921 1.000000 0.997933 0.984375
2 poly support vector machine 0.788918 0.841270 0.938010 0.800000
3 decision tree 0.957309 0.882353 0.989238 0.946565
4 random forest 0.963918 0.915493 0.991275 0.961832
5 naive bayes 0.040404 0.000000 0.825701 0.000000
6 logistic regression 0.265781 0.000000 0.820967 0.078947
7 k nearest neighbours 0.801609 0.534653 0.952988 0.666667

In [46]:
df_precision


Out[46]:
method acc good unacc vgood
0 linear support vector machine 0.182292 0.000000 0.981818 0.000000
1 rbf support vector machine 0.994792 1.000000 0.997521 0.969231
2 poly support vector machine 0.778646 0.768116 0.950413 0.738462
3 decision tree 0.963542 0.869565 0.987603 0.953846
4 random forest 0.973958 0.942029 0.985950 0.969231
5 naive bayes 0.020833 0.000000 0.998347 0.000000
6 logistic regression 0.208333 0.000000 0.919008 0.046154
7 k nearest neighbours 0.778646 0.391304 0.988430 0.507692

In [47]:
df_recall


Out[47]:
method acc good unacc vgood
0 linear support vector machine 0.530303 0.000000 0.744361 0.000000
1 rbf support vector machine 0.987080 1.000000 0.998346 1.000000
2 poly support vector machine 0.799465 0.929825 0.925926 0.872727
3 decision tree 0.951157 0.895522 0.990879 0.939394
4 random forest 0.954082 0.890411 0.996658 0.954545
5 naive bayes 0.666667 0.000000 0.703963 0.000000
6 logistic regression 0.366972 0.000000 0.741828 0.272727
7 k nearest neighbours 0.825967 0.843750 0.920000 0.970588

In [42]:
labels_counts=df[6].value_counts()
pd.Series(map6).map(labels_counts)


Out[42]:
acc       384
good       69
unacc    1210
vgood      65
dtype: int64

In [ ]: