notebook.community

Edit and run



In [1]:

    
import seldon.pipeline.auto_transforms as auto
import pandas as pd
import logging
# Emit INFO-level (and above) log records, each prefixed with a timestamp and
# its severity name.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

# Location of the wine data set and the column names to attach to it.
# header=None tells pandas not to treat the first line as a header row, so the
# names below are applied to the columns in order.
WINE_DATA_URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
WINE_COLUMNS = [
    "target",
    "Alcohol",
    "Malic acid",
    "Ash",
    "Alcalinity of ash",
    "Magnesium",
    "Total phenols",
    "Flavanoids",
    "Nonflavanoid phenols",
    "Proanthocyanins",
    "Color intensity",
    "Hue",
    "OD280/OD315 of diluted wines",
    "Proline",
]

df = pd.read_csv(WINE_DATA_URL, header=None, names=WINE_COLUMNS)



In [2]:

    
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)









    Out[2]:






  
    
      
      target
      Alcohol
      Malic acid
      Ash
      Alcalinity of ash
      Magnesium
      Total phenols
      Flavanoids
      Nonflavanoid phenols
      Proanthocyanins
      Color intensity
      Hue
      OD280/OD315 of diluted wines
      Proline
    
  
  
    
      0
      1
      14.23
      1.71
      2.43
      15.6
      127
      2.80
      3.06
      0.28
      2.29
      5.64
      1.04
      3.92
      1065
    
    
      1
      1
      13.20
      1.78
      2.14
      11.2
      100
      2.65
      2.76
      0.26
      1.28
      4.38
      1.05
      3.40
      1050
    
    
      2
      1
      13.16
      2.36
      2.67
      18.6
      101
      2.80
      3.24
      0.30
      2.81
      5.68
      1.03
      3.17
      1185
    
    
      3
      1
      14.37
      1.95
      2.50
      16.8
      113
      3.85
      3.49
      0.24
      2.18
      7.80
      0.86
      3.45
      1480
    
    
      4
      1
      13.24
      2.59
      2.87
      21.0
      118
      2.80
      2.69
      0.39
      1.82
      4.32
      1.04
      2.93
      735

Shift the class labels so they are zero-based, then create an auto transform that scales the numeric feature columns (the target column is excluded).



In [3]:

    
# Shift the class labels down by one so they start at 0.
# The shifted labels go into a new frame instead of being written back into
# `df`: assigning back would decrement the labels again every time this cell is
# re-run, and would leave the earlier `df.head()` display out of date.
df_zero_based = df.assign(target=df["target"] - 1)

# Fit the automatic transform and apply it in one step; "target" is listed in
# `exclude` so the label column is not passed through the scaling.
t_auto = auto.Auto_transform(exclude=["target"])
df2 = t_auto.fit_transform(df_zero_based)
df2.head()









    Out[3]:






  
    
      
      target
      Alcohol
      Malic acid
      Ash
      Alcalinity of ash
      Magnesium
      Total phenols
      Flavanoids
      Nonflavanoid phenols
      Proanthocyanins
      Color intensity
      Hue
      OD280/OD315 of diluted wines
      Proline
    
  
  
    
      0
      0
      1.518613
      -0.562250
      0.232053
      -1.169593
      1.913905
      0.808997
      1.034819
      -0.659563
      1.224884
      0.251717
      0.362177
      1.847920
      1.013009
    
    
      1
      0
      0.246290
      -0.499413
      -0.827996
      -2.490847
      0.018145
      0.568648
      0.733629
      -0.820719
      -0.544721
      -0.293321
      0.406051
      1.113449
      0.965242
    
    
      2
      0
      0.196879
      0.021231
      1.109334
      -0.268738
      0.088358
      0.808997
      1.215533
      -0.498407
      2.135968
      0.269020
      0.318304
      0.788587
      1.395148
    
    
      3
      0
      1.691550
      -0.346811
      0.487926
      -0.809251
      0.930918
      2.491446
      1.466525
      -0.981875
      1.032155
      1.186068
      -0.427544
      1.184071
      2.334574
    
    
      4
      0
      0.295700
      0.227694
      1.840403
      0.451946
      1.281985
      0.808997
      0.663351
      0.226796
      0.401404
      -0.319276
      0.362177
      0.449601
      -0.037874

Create an XGBoost classifier and run 5-fold cross-validation on the scaled data.



In [4]:

    
# Import the module under an alias so that binding the classifier to `xgb`
# below does not overwrite the module object: previously the module became
# unreachable once the cell had run and `xgb` silently changed from a module
# to a classifier instance.
from seldon import xgb as seldon_xgb
import seldon.pipeline.cross_validation as cf

# Classifier that reads its labels from the "target" column.
xgb = seldon_xgb.XGBoostClassifier(target="target")

# Wrap the classifier in a 5-fold cross-validation run over the scaled frame.
cv = cf.Seldon_KFold(xgb, 5)
cv.fit(df2)









    



/home/clive/tools/scikit-learn/sklearn/cross_validation.py:42: DeprecationWarning: This module has been deprecated in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)






    Out[4]:





Seldon_KFold(clf=XGBoostClassifier(base_score=0.5,
         clf=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_al...torizer=DictVectorizer(dtype=<type 'numpy.float64'>, separator='=', sort=True,
        sparse=True)),
       k=5)



In [5]:

    
# Report the score collected by the cross-validation run.
# Written as a single parenthesised, pre-formatted string so the line is valid
# on both Python 2 and Python 3; the two spaces reproduce the output of the
# original `print "Average accuracy ", value` statement exactly.
print("Average accuracy  %s" % (cv.get_score(),))









    



Average accuracy  0.96619047619



In [ ]:

	target	Alcohol	Malic acid	Ash	Alcalinity of ash	Magnesium	Total phenols	Flavanoids	Nonflavanoid phenols	Proanthocyanins	Color intensity	Hue	OD280/OD315 of diluted wines	Proline
0	1	14.23	1.71	2.43	15.6	127	2.80	3.06	0.28	2.29	5.64	1.04	3.92	1065
1	1	13.20	1.78	2.14	11.2	100	2.65	2.76	0.26	1.28	4.38	1.05	3.40	1050
2	1	13.16	2.36	2.67	18.6	101	2.80	3.24	0.30	2.81	5.68	1.03	3.17	1185
3	1	14.37	1.95	2.50	16.8	113	3.85	3.49	0.24	2.18	7.80	0.86	3.45	1480
4	1	13.24	2.59	2.87	21.0	118	2.80	2.69	0.39	1.82	4.32	1.04	2.93	735

	Alcohol	Malic acid	Ash	Alcalinity of ash	Magnesium	Total phenols	Flavanoids	Nonflavanoid phenols	Proanthocyanins	Color intensity	Hue	OD280/OD315 of diluted wines	Proline
0	1.518613	-0.562250	0.232053	-1.169593	1.913905	0.808997	1.034819	-0.659563	1.224884	0.251717	0.362177	1.847920	1.013009
1	0.246290	-0.499413	-0.827996	-2.490847	0.018145	0.568648	0.733629	-0.820719	-0.544721	-0.293321	0.406051	1.113449	0.965242
2	0.196879	0.021231	1.109334	-0.268738	0.088358	0.808997	1.215533	-0.498407	2.135968	0.269020	0.318304	0.788587	1.395148
3	1.691550	-0.346811	0.487926	-0.809251	0.930918	2.491446	1.466525	-0.981875	1.032155	1.186068	-0.427544	1.184071	2.334574
4	0.295700	0.227694	1.840403	0.451946	1.281985	0.808997	0.663351	0.226796	0.401404	-0.319276	0.362177	0.449601	-0.037874