pseudocode

  • In your groups, write pseudocode for a 5-fold CV function
  • Take in input data (np.array) and model
  • Split into training and test across 5 iterations
  • Use the model functions to fit, predict, and score
  • Output an estimate of the model accuracy

In [18]:
import pandas as pd
%matplotlib inline
from sklearn import datasets
from sklearn import tree
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cross_validation import cross_val_score
from random import shuffle
import random
from sklearn import metrics

In [4]:
iris = datasets.load_iris()

In [5]:
x = iris.data[:,2:] 
y = iris.target

In [6]:
dt = tree.DecisionTreeClassifier()

In [7]:
dt = dt.fit(x,y)

In [21]:
scores = []

data = list(zip(x, y)) #randomnize the order of data(shuffle the index number)
random.shuffle(data)
x, y = zip(*data)

# #split data into 5 parts
# k = 5
# subset_size = (len(linked_iris_data)/k) - 1


def cross_validation():
    leng = len(data)
    div = leng/5 -1 #     divide the data by 5 parts
    print(div)
    for i in range(0,5):
        x_test = x[int(div * i): int(div * (i+1))]
        y_test = y[int(div * i): int(div * (i+1))]
        x_train = x[:int(div * i)] + x[int(div * (i+1)):]
        y_train = y[:int(div * i)] + y[int(div * (i+1)):]
        
        dt = tree.DecisionTreeClassifier().fit(x_train,y_train)
        
        # test
        y_pred = dt.predict(x_test)
        score = metrics.accuracy_score(y_test, y_pred)
        
        scores.append(score)

In [22]:
cross_validation()


29.0

In [23]:
scores


Out[23]:
[0.93103448275862066, 0.93103448275862066, 0.86206896551724133, 1.0, 1.0]

In [26]:
mean = sum(scores)/len(scores)
mean


Out[26]:
0.94482758620689644

testing


In [27]:
from sklearn.cross_validation import cross_val_score

In [28]:
sc = cross_val_score(dt,x,y,cv=5)

In [29]:
np.mean(sc)


Out[29]:
0.95333333333333337

In [ ]: