In [1]:
import seldon.pipeline.auto_transforms as auto
import pandas as pd
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
                 header=None,names=["target","Alcohol","Malic acid","Ash","Alcalinity of ash","Magnesium","Total phenols","Flavanoids",
                         "Nonflavanoid phenols","Proanthocyanins","Color intensity","Hue","OD280/OD315 of diluted wines",
                         "Proline"])

In [2]:
df.head()


Out[2]:
target Alcohol Malic acid Ash Alcalinity of ash Magnesium Total phenols Flavanoids Nonflavanoid phenols Proanthocyanins Color intensity Hue OD280/OD315 of diluted wines Proline
0 1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065
1 1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050
2 1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185
3 1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480
4 1 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735

Create an auto transform to scale numeric columns automatically.


In [3]:
df["target"] = df["target"] - 1
t_auto = auto.Auto_transform(exclude=["target"])
df2 = t_auto.fit_transform(df)
df2.head()


Out[3]:
target Alcohol Malic acid Ash Alcalinity of ash Magnesium Total phenols Flavanoids Nonflavanoid phenols Proanthocyanins Color intensity Hue OD280/OD315 of diluted wines Proline
0 0 1.518613 -0.562250 0.232053 -1.169593 1.913905 0.808997 1.034819 -0.659563 1.224884 0.251717 0.362177 1.847920 1.013009
1 0 0.246290 -0.499413 -0.827996 -2.490847 0.018145 0.568648 0.733629 -0.820719 -0.544721 -0.293321 0.406051 1.113449 0.965242
2 0 0.196879 0.021231 1.109334 -0.268738 0.088358 0.808997 1.215533 -0.498407 2.135968 0.269020 0.318304 0.788587 1.395148
3 0 1.691550 -0.346811 0.487926 -0.809251 0.930918 2.491446 1.466525 -0.981875 1.032155 1.186068 -0.427544 1.184071 2.334574
4 0 0.295700 0.227694 1.840403 0.451946 1.281985 0.808997 0.663351 0.226796 0.401404 -0.319276 0.362177 0.449601 -0.037874

Create an XGBoost classifier and run 5-fold cross validation on the data.


In [4]:
from seldon import xgb
import seldon.pipeline.cross_validation as cf
xgb = xgb.XGBoostClassifier(target="target")
cv = cf.Seldon_KFold(xgb,5)
cv.fit(df2)


/home/clive/tools/scikit-learn/sklearn/cross_validation.py:42: DeprecationWarning: This module has been deprecated in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
Out[4]:
Seldon_KFold(clf=XGBoostClassifier(base_score=0.5,
         clf=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_al...torizer=DictVectorizer(dtype=<type 'numpy.float64'>, separator='=', sort=True,
        sparse=True)),
       k=5)

In [5]:
print "Average accuracy ",cv.get_score()


Average accuracy  0.96619047619

In [ ]: