In [18]:
import random
from random import shuffle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import metrics
from sklearn import tree
try:
    # scikit-learn >= 0.18; sklearn.cross_validation was removed in 0.20.
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

%matplotlib inline
In [4]:
# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
In [5]:
# Features: every row, columns from index 2 onward; labels: the target array.
x, y = iris.data[:, 2:], iris.target
In [6]:
# Create an unfitted decision tree classifier with default settings.
dt = tree.DecisionTreeClassifier()
In [7]:
# Fit the tree on the full feature/label set (no hold-out split at this point).
dt = dt.fit(x,y)
In [21]:
# One accuracy value per fold is collected here by cross_validation().
scores = []
# Fixed seed so the shuffle (and therefore the fold scores) is reproducible
# across kernel restarts.
random.seed(42)
# Pair each sample with its label and shuffle the pairs together so that
# features and labels stay aligned.
data = list(zip(x, y))
random.shuffle(data)
# Unzip back into two parallel tuples (features, labels) in the shuffled order.
x, y = zip(*data)
def cross_validation(k=5):
    """Run a manual k-fold cross-validation of a decision tree.

    Splits the module-level ``x`` / ``y`` sequences into ``k`` contiguous
    folds. For each fold a fresh ``DecisionTreeClassifier`` is trained on the
    remaining folds and its accuracy on the held-out fold is appended to the
    module-level ``scores`` list.

    Parameters
    ----------
    k : int, default 5
        Number of folds. Must satisfy ``2 <= k <= len(x)``.

    Returns
    -------
    None
        Results are written to ``scores`` (one accuracy per fold).

    Raises
    ------
    ValueError
        If ``k`` is smaller than 2 or larger than the number of samples.
    """
    n_samples = len(x)
    if not 2 <= k <= n_samples:
        raise ValueError(
            "k must be between 2 and the number of samples (%d), got %r"
            % (n_samples, k)
        )

    # Start from an empty list so that re-running the call does not pile new
    # scores on top of the ones from an earlier run.
    del scores[:]

    # Fold i covers [bounds[i], bounds[i + 1]). Deriving the boundaries from
    # the full length guarantees every sample lands in exactly one test fold;
    # a fold size of len/5 - 1 would leave the tail of the data untested.
    bounds = [n_samples * i // k for i in range(k + 1)]

    for start, stop in zip(bounds[:-1], bounds[1:]):
        # list(...) makes the concatenation below work for tuples and arrays.
        x_test = list(x[start:stop])
        y_test = list(y[start:stop])
        x_train = list(x[:start]) + list(x[stop:])
        y_train = list(y[:start]) + list(y[stop:])

        # Train on everything outside the held-out fold.
        model = tree.DecisionTreeClassifier().fit(x_train, y_train)

        # Score on the held-out fold.
        y_pred = model.predict(x_test)
        scores.append(metrics.accuracy_score(y_test, y_pred))
In [22]:
# Run the manual cross-validation; it appends one accuracy per fold to `scores`.
cross_validation()
In [23]:
# Show the per-fold accuracies collected by cross_validation().
scores
Out[23]:
In [26]:
# Average accuracy over all recorded folds.
fold_count = len(scores)
mean = sum(scores) / fold_count
mean
Out[26]:
In [27]:
try:
    # scikit-learn >= 0.18; sklearn.cross_validation was removed in 0.20.
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
In [28]:
# Score the same estimator with scikit-learn's cross_val_score (cv=5) to
# compare against the hand-written cross-validation.
sc = cross_val_score(dt,x,y,cv=5)
In [29]:
# Mean of the scores returned by cross_val_score.
sc.mean()
Out[29]:
In [ ]: