notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
from sklearn import cross_validation
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score



In [10]:

    
#read data in
df = pd.read_csv('data_cars.csv',header=None)
for i in range(len(df.columns)):
    df[i] = df[i].astype('category')
df.head()









    Out[10]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
    
  
  
    
      0
      vhigh
      vhigh
      2
      2
      small
      low
      unacc
    
    
      1
      vhigh
      vhigh
      2
      2
      small
      med
      unacc
    
    
      2
      vhigh
      vhigh
      2
      2
      small
      high
      unacc
    
    
      3
      vhigh
      vhigh
      2
      2
      med
      low
      unacc
    
    
      4
      vhigh
      vhigh
      2
      2
      med
      med
      unacc



In [14]:

    
#map catgories to values
map0 = dict( zip( df[0].cat.categories, range( len(df[0].cat.categories ))))
#print map0
map1 = dict( zip( df[1].cat.categories, range( len(df[1].cat.categories ))))
map2 = dict( zip( df[2].cat.categories, range( len(df[2].cat.categories ))))
map3 = dict( zip( df[3].cat.categories, range( len(df[3].cat.categories ))))
map4 = dict( zip( df[4].cat.categories, range( len(df[4].cat.categories ))))
map5 = dict( zip( df[5].cat.categories, range( len(df[5].cat.categories ))))
map6 = dict( zip( df[6].cat.categories, range( len(df[6].cat.categories ))))

cat_cols = df.select_dtypes(['category']).columns
df[cat_cols] = df[cat_cols].apply(lambda x: x.cat.codes)

df = df.iloc[np.random.permutation(len(df))]
print df.head()









    



      0  1  2  3  4  5  6
570   0  0  1  0  1  1  2
951   2  3  3  0  0  1  2
1633  1  1  0  1  1  2  0
412   3  1  3  0  0  2  2
156   3  0  1  2  1  1  2



In [40]:

    
df_f1 = pd.DataFrame(columns=['method']+sorted(map6, key=map6.get))
df_precision = pd.DataFrame(columns=['method']+sorted(map6, key=map6.get))
df_recall = pd.DataFrame(columns=['method']+sorted(map6, key=map6.get))
def CalcMeasures(method,y_pred,y_true,df_f1=df_f1
                 ,df_precision=df_precision,df_recall=df_recall):

    df_f1.loc[len(df_f1)] = [method]+list(f1_score(y_pred,y_true,average=None))
    df_precision.loc[len(df_precision)] = [method]+list(precision_score(y_pred,y_true,average=None))
    df_recall.loc[len(df_recall)] = [method]+list(recall_score(y_pred,y_true,average=None))
    
X= df[df.columns[:-1]].values
Y = df[df.columns[-1]].values



In [41]:

    
cv = 10
method = 'linear support vector machine'
clf = svm.SVC(kernel='linear',C=50)
y_pred = cross_validation.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'rbf support vector machine'
clf = svm.SVC(kernel='rbf',C=50)
y_pred = cross_validation.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'poly support vector machine'
clf = svm.SVC(kernel='poly',C=50)
y_pred = cross_validation.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'decision tree'
clf = DecisionTreeClassifier(random_state=0)
y_pred = cross_validation.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'random forest'
clf = RandomForestClassifier(n_estimators=50,random_state=0,max_features=None)
y_pred = cross_validation.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'naive bayes'
clf = MultinomialNB()
y_pred = cross_validation.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'logistic regression'
clf = LogisticRegression()
y_pred = cross_validation.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'k nearest neighbours'
clf = KNeighborsClassifier(weights='distance',n_neighbors=5)
y_pred = cross_validation.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)



In [45]:

    
df_f1









    Out[45]:






  
    
      
      method
      acc
      good
      unacc
      vgood
    
  
  
    
      0
      linear support vector machine
      0.271318
      0.000000
      0.846757
      0.000000
    
    
      1
      rbf support vector machine
      0.990921
      1.000000
      0.997933
      0.984375
    
    
      2
      poly support vector machine
      0.788918
      0.841270
      0.938010
      0.800000
    
    
      3
      decision tree
      0.957309
      0.882353
      0.989238
      0.946565
    
    
      4
      random forest
      0.963918
      0.915493
      0.991275
      0.961832
    
    
      5
      naive bayes
      0.040404
      0.000000
      0.825701
      0.000000
    
    
      6
      logistic regression
      0.265781
      0.000000
      0.820967
      0.078947
    
    
      7
      k nearest neighbours
      0.801609
      0.534653
      0.952988
      0.666667



In [46]:

    
df_precision









    Out[46]:






  
    
      
      method
      acc
      good
      unacc
      vgood
    
  
  
    
      0
      linear support vector machine
      0.182292
      0.000000
      0.981818
      0.000000
    
    
      1
      rbf support vector machine
      0.994792
      1.000000
      0.997521
      0.969231
    
    
      2
      poly support vector machine
      0.778646
      0.768116
      0.950413
      0.738462
    
    
      3
      decision tree
      0.963542
      0.869565
      0.987603
      0.953846
    
    
      4
      random forest
      0.973958
      0.942029
      0.985950
      0.969231
    
    
      5
      naive bayes
      0.020833
      0.000000
      0.998347
      0.000000
    
    
      6
      logistic regression
      0.208333
      0.000000
      0.919008
      0.046154
    
    
      7
      k nearest neighbours
      0.778646
      0.391304
      0.988430
      0.507692



In [47]:

    
df_recall









    Out[47]:






  
    
      
      method
      acc
      good
      unacc
      vgood
    
  
  
    
      0
      linear support vector machine
      0.530303
      0.000000
      0.744361
      0.000000
    
    
      1
      rbf support vector machine
      0.987080
      1.000000
      0.998346
      1.000000
    
    
      2
      poly support vector machine
      0.799465
      0.929825
      0.925926
      0.872727
    
    
      3
      decision tree
      0.951157
      0.895522
      0.990879
      0.939394
    
    
      4
      random forest
      0.954082
      0.890411
      0.996658
      0.954545
    
    
      5
      naive bayes
      0.666667
      0.000000
      0.703963
      0.000000
    
    
      6
      logistic regression
      0.366972
      0.000000
      0.741828
      0.272727
    
    
      7
      k nearest neighbours
      0.825967
      0.843750
      0.920000
      0.970588



In [42]:

    
labels_counts=df[6].value_counts()
pd.Series(map6).map(labels_counts)









    Out[42]:





acc       384
good       69
unacc    1210
vgood      65
dtype: int64



In [ ]:

	0	1	2	3	4	5	6
0	vhigh	vhigh	2	2	small	low	unacc
1	vhigh	vhigh	2	2	small	med	unacc
2	vhigh	vhigh	2	2	small	high	unacc
3	vhigh	vhigh	2	2	med	low	unacc
4	vhigh	vhigh	2	2	med	med	unacc

	method	acc	good	unacc	vgood
0	linear support vector machine	0.271318	0.000000	0.846757	0.000000
1	rbf support vector machine	0.990921	1.000000	0.997933	0.984375
2	poly support vector machine	0.788918	0.841270	0.938010	0.800000
3	decision tree	0.957309	0.882353	0.989238	0.946565
4	random forest	0.963918	0.915493	0.991275	0.961832
5	naive bayes	0.040404	0.000000	0.825701	0.000000
6	logistic regression	0.265781	0.000000	0.820967	0.078947
7	k nearest neighbours	0.801609	0.534653	0.952988	0.666667

	method	acc	good	unacc	vgood
0	linear support vector machine	0.182292	0.000000	0.981818	0.000000
1	rbf support vector machine	0.994792	1.000000	0.997521	0.969231
2	poly support vector machine	0.778646	0.768116	0.950413	0.738462
3	decision tree	0.963542	0.869565	0.987603	0.953846
4	random forest	0.973958	0.942029	0.985950	0.969231
5	naive bayes	0.020833	0.000000	0.998347	0.000000
6	logistic regression	0.208333	0.000000	0.919008	0.046154
7	k nearest neighbours	0.778646	0.391304	0.988430	0.507692

	method	acc	good	unacc	vgood
0	linear support vector machine	0.530303	0.000000	0.744361	0.000000
1	rbf support vector machine	0.987080	1.000000	0.998346	1.000000
2	poly support vector machine	0.799465	0.929825	0.925926	0.872727
3	decision tree	0.951157	0.895522	0.990879	0.939394
4	random forest	0.954082	0.890411	0.996658	0.954545
5	naive bayes	0.666667	0.000000	0.703963	0.000000
6	logistic regression	0.366972	0.000000	0.741828	0.272727
7	k nearest neighbours	0.825967	0.843750	0.920000	0.970588