In [2]:
import numpy as np
import random
from sklearn import metrics
from sklearn import datasets
from sklearn import tree

In [3]:
iris = datasets.load_iris()
# iris['data']

In [4]:
x = iris.data[:,2:]
y = iris.target

In [5]:
score_list = []

#shuffle data
linked_iris_data = list(zip(x, y))
random.shuffle(linked_iris_data)
x, y = zip(*linked_iris_data)

#split data into 5 parts
k = 5
subset_size = (len(linked_iris_data)/k) - 1

for i in range(0,k):

    x_test = x[int(subset_size * i): int(subset_size * (i+1))]
    y_test = y[int(subset_size * i): int(subset_size * (i+1))]

    #hold out a different portion as training data
    x_train = x[:int(subset_size * i)] + x[int(subset_size * (i+1)):]
    y_train = y[:int(subset_size * i)] + y[int(subset_size * (i+1)):]

    #fit on the train data
    dt = tree.DecisionTreeClassifier().fit(x_train,y_train)

    #run on test data
    y_pred = dt.predict(x_test)
    score = metrics.accuracy_score(y_test, y_pred)

    #return score, append it to list
    score_list.append(score)

In [6]:
score_list


Out[6]:
[1.0,
 0.93103448275862066,
 0.96551724137931039,
 0.96551724137931039,
 0.93103448275862066]

In [7]:
average_score = sum(score_list)/len(score_list)
average_score


Out[7]:
0.95862068965517244

In [ ]:

Testing my code against sklearn


In [29]:
from sklearn.cross_validation import cross_val_score

In [30]:
scores = cross_val_score(dt,x,y,cv=5)

In [31]:
np.mean(scores)


Out[31]:
0.94666666666666666

In [ ]:
# results are slightly different -- I guess that's due to differently shuffeling that sorts the data differently.