In [1]:
from sklearn import cross_validation

In [2]:
from sklearn import datasets

In [3]:
iris = datasets.load_iris()

In [21]:
data_train, data_test, target_train, target_test = cross_validation.train_test_split(iris.data[:, :2], 
                                                                                     iris.target, test_size=0.3333, random_state = 0)

In [14]:
from sklearn import tree

In [23]:
clf = tree.DecisionTreeClassifier()

In [24]:
clf = clf.fit(data_train, target_train)

In [25]:
pred = clf.predict(data_test)

In [26]:
1.0*sum(pred!=target_test)/len(target_test)


Out[26]:
0.32000000000000001

In [27]:
clf.score(data_test, target_test)


Out[27]:
0.68000000000000005

In [28]:
scores = cross_validation.cross_val_score(clf, data_train, target_train, cv=5)

In [29]:
print scores


[ 0.8   0.7   0.7   0.5   0.75]

In [30]:
print scores.mean()


0.69

In [48]:
bestPruning = -1
bestAccuracy = 0
for i in range(2, len(target_train)):
    clf = tree.DecisionTreeClassifier()
    scores = cross_validation.cross_val_score(clf, data_train, target_train, cv=i)
    if scores.mean() > bestAccuracy:
        bestAccuracy = scores.mean()
        bestPruning = i

In [49]:
print bestAccuracy
print bestPruning


0.789661319073
3

In [47]:


In [ ]: