In [2]:
import numpy as np
import random
from sklearn import metrics
from sklearn import datasets
from sklearn import tree
In [3]:
# Load scikit-learn's bundled iris dataset; later cells read its .data and .target attributes.
iris = datasets.load_iris()
# iris['data']  # uncomment to inspect the raw feature matrix
In [4]:
# Features: every column of iris.data from index 2 onward; labels: iris.target.
x, y = iris.data[:, 2:], iris.target
In [5]:
# Manual k-fold cross-validation of a decision tree.
# Side effects: rebinds x and y to shuffled tuples, fills score_list with one
# accuracy per fold, and leaves dt bound to the tree fitted on the last fold.
SEED = 42  # fixed so a fresh Restart & Run All reproduces the same folds and trees
score_list = []
#shuffle data (features and labels are zipped so each pair stays aligned)
linked_iris_data = list(zip(x, y))
# A dedicated Random instance keeps the global random state untouched.
random.Random(SEED).shuffle(linked_iris_data)
x, y = zip(*linked_iris_data)
#split data into k parts
k = 5
# Floor division gives the real fold size; the previous "(len / k) - 1" made
# every fold one sample short, so the tail of the data was never tested.
subset_size = len(linked_iris_data) // k
for i in range(k):
    start = subset_size * i
    # The last fold runs to the end so no sample is left untested when the
    # number of samples is not an exact multiple of k.
    stop = subset_size * (i + 1) if i < k - 1 else len(linked_iris_data)
    #hold out fold i as test data
    x_test = x[start:stop]
    y_test = y[start:stop]
    #train on everything outside the held-out fold
    x_train = x[:start] + x[stop:]
    y_train = y[:start] + y[stop:]
    #fit on the train data
    dt = tree.DecisionTreeClassifier(random_state=SEED).fit(x_train, y_train)
    #run on test data
    y_pred = dt.predict(x_test)
    score = metrics.accuracy_score(y_test, y_pred)
    #append this fold's accuracy to the list
    score_list.append(score)
In [6]:
# Show the accuracy recorded for each fold.
score_list
Out[6]:
In [7]:
# Average the per-fold accuracies into a single summary figure.
n_scores = len(score_list)
average_score = sum(score_list) / n_scores
average_score
Out[7]:
In [ ]:
In [29]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; cross_val_score now lives in sklearn.model_selection. Fall back to the
# old location only for installations that predate the move.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
In [30]:
# Score the same estimator with scikit-learn's built-in 5-fold cross-validation.
scores = cross_val_score(estimator=dt, X=x, y=y, cv=5)
In [31]:
# Mean of the scores returned by cross_val_score.
np.mean(scores)
Out[31]:
In [ ]:
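# Results are slightly different -- presumably because the two approaches split the data into folds differently: the manual loop tests on contiguous slices of the shuffled data, while cross_val_score builds its own folds.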