Use the pseudocode you came up with in class to write your own 5-fold cross-validation function that splits the data set into
Don't forget to shuffle the input before assigning to the splits
You can use the fit
Test the results with the sklearn cross_val_score
In your PR, discuss what challenges you had creating this function and if it helped you better understand cross validation



In [6]:

    
import pandas as pd
%matplotlib inline
from sklearn import datasets
from sklearn import tree
import matplotlib.pyplot as plt
import numpy as np
from random import shuffle
from sklearn.metrics import accuracy_score



In [2]:

    
iris = datasets.load_iris() # load iris data set



In [3]:

    
iris.keys()









    Out[3]:





dict_keys(['feature_names', 'target_names', 'DESCR', 'target', 'data'])



In [4]:

    
iris['target_names']









    Out[4]:





array(['setosa', 'versicolor', 'virginica'], 
      dtype='<U10')



In [5]:

    
iris['target']









    Out[5]:





array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])



In [6]:

    
iris['data']









    Out[6]:





array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2],
       [ 5.4,  3.9,  1.7,  0.4],
       [ 4.6,  3.4,  1.4,  0.3],
       [ 5. ,  3.4,  1.5,  0.2],
       [ 4.4,  2.9,  1.4,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 5.4,  3.7,  1.5,  0.2],
       [ 4.8,  3.4,  1.6,  0.2],
       [ 4.8,  3. ,  1.4,  0.1],
       [ 4.3,  3. ,  1.1,  0.1],
       [ 5.8,  4. ,  1.2,  0.2],
       [ 5.7,  4.4,  1.5,  0.4],
       [ 5.4,  3.9,  1.3,  0.4],
       [ 5.1,  3.5,  1.4,  0.3],
       [ 5.7,  3.8,  1.7,  0.3],
       [ 5.1,  3.8,  1.5,  0.3],
       [ 5.4,  3.4,  1.7,  0.2],
       [ 5.1,  3.7,  1.5,  0.4],
       [ 4.6,  3.6,  1. ,  0.2],
       [ 5.1,  3.3,  1.7,  0.5],
       [ 4.8,  3.4,  1.9,  0.2],
       [ 5. ,  3. ,  1.6,  0.2],
       [ 5. ,  3.4,  1.6,  0.4],
       [ 5.2,  3.5,  1.5,  0.2],
       [ 5.2,  3.4,  1.4,  0.2],
       [ 4.7,  3.2,  1.6,  0.2],
       [ 4.8,  3.1,  1.6,  0.2],
       [ 5.4,  3.4,  1.5,  0.4],
       [ 5.2,  4.1,  1.5,  0.1],
       [ 5.5,  4.2,  1.4,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 5. ,  3.2,  1.2,  0.2],
       [ 5.5,  3.5,  1.3,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 4.4,  3. ,  1.3,  0.2],
       [ 5.1,  3.4,  1.5,  0.2],
       [ 5. ,  3.5,  1.3,  0.3],
       [ 4.5,  2.3,  1.3,  0.3],
       [ 4.4,  3.2,  1.3,  0.2],
       [ 5. ,  3.5,  1.6,  0.6],
       [ 5.1,  3.8,  1.9,  0.4],
       [ 4.8,  3. ,  1.4,  0.3],
       [ 5.1,  3.8,  1.6,  0.2],
       [ 4.6,  3.2,  1.4,  0.2],
       [ 5.3,  3.7,  1.5,  0.2],
       [ 5. ,  3.3,  1.4,  0.2],
       [ 7. ,  3.2,  4.7,  1.4],
       [ 6.4,  3.2,  4.5,  1.5],
       [ 6.9,  3.1,  4.9,  1.5],
       [ 5.5,  2.3,  4. ,  1.3],
       [ 6.5,  2.8,  4.6,  1.5],
       [ 5.7,  2.8,  4.5,  1.3],
       [ 6.3,  3.3,  4.7,  1.6],
       [ 4.9,  2.4,  3.3,  1. ],
       [ 6.6,  2.9,  4.6,  1.3],
       [ 5.2,  2.7,  3.9,  1.4],
       [ 5. ,  2. ,  3.5,  1. ],
       [ 5.9,  3. ,  4.2,  1.5],
       [ 6. ,  2.2,  4. ,  1. ],
       [ 6.1,  2.9,  4.7,  1.4],
       [ 5.6,  2.9,  3.6,  1.3],
       [ 6.7,  3.1,  4.4,  1.4],
       [ 5.6,  3. ,  4.5,  1.5],
       [ 5.8,  2.7,  4.1,  1. ],
       [ 6.2,  2.2,  4.5,  1.5],
       [ 5.6,  2.5,  3.9,  1.1],
       [ 5.9,  3.2,  4.8,  1.8],
       [ 6.1,  2.8,  4. ,  1.3],
       [ 6.3,  2.5,  4.9,  1.5],
       [ 6.1,  2.8,  4.7,  1.2],
       [ 6.4,  2.9,  4.3,  1.3],
       [ 6.6,  3. ,  4.4,  1.4],
       [ 6.8,  2.8,  4.8,  1.4],
       [ 6.7,  3. ,  5. ,  1.7],
       [ 6. ,  2.9,  4.5,  1.5],
       [ 5.7,  2.6,  3.5,  1. ],
       [ 5.5,  2.4,  3.8,  1.1],
       [ 5.5,  2.4,  3.7,  1. ],
       [ 5.8,  2.7,  3.9,  1.2],
       [ 6. ,  2.7,  5.1,  1.6],
       [ 5.4,  3. ,  4.5,  1.5],
       [ 6. ,  3.4,  4.5,  1.6],
       [ 6.7,  3.1,  4.7,  1.5],
       [ 6.3,  2.3,  4.4,  1.3],
       [ 5.6,  3. ,  4.1,  1.3],
       [ 5.5,  2.5,  4. ,  1.3],
       [ 5.5,  2.6,  4.4,  1.2],
       [ 6.1,  3. ,  4.6,  1.4],
       [ 5.8,  2.6,  4. ,  1.2],
       [ 5. ,  2.3,  3.3,  1. ],
       [ 5.6,  2.7,  4.2,  1.3],
       [ 5.7,  3. ,  4.2,  1.2],
       [ 5.7,  2.9,  4.2,  1.3],
       [ 6.2,  2.9,  4.3,  1.3],
       [ 5.1,  2.5,  3. ,  1.1],
       [ 5.7,  2.8,  4.1,  1.3],
       [ 6.3,  3.3,  6. ,  2.5],
       [ 5.8,  2.7,  5.1,  1.9],
       [ 7.1,  3. ,  5.9,  2.1],
       [ 6.3,  2.9,  5.6,  1.8],
       [ 6.5,  3. ,  5.8,  2.2],
       [ 7.6,  3. ,  6.6,  2.1],
       [ 4.9,  2.5,  4.5,  1.7],
       [ 7.3,  2.9,  6.3,  1.8],
       [ 6.7,  2.5,  5.8,  1.8],
       [ 7.2,  3.6,  6.1,  2.5],
       [ 6.5,  3.2,  5.1,  2. ],
       [ 6.4,  2.7,  5.3,  1.9],
       [ 6.8,  3. ,  5.5,  2.1],
       [ 5.7,  2.5,  5. ,  2. ],
       [ 5.8,  2.8,  5.1,  2.4],
       [ 6.4,  3.2,  5.3,  2.3],
       [ 6.5,  3. ,  5.5,  1.8],
       [ 7.7,  3.8,  6.7,  2.2],
       [ 7.7,  2.6,  6.9,  2.3],
       [ 6. ,  2.2,  5. ,  1.5],
       [ 6.9,  3.2,  5.7,  2.3],
       [ 5.6,  2.8,  4.9,  2. ],
       [ 7.7,  2.8,  6.7,  2. ],
       [ 6.3,  2.7,  4.9,  1.8],
       [ 6.7,  3.3,  5.7,  2.1],
       [ 7.2,  3.2,  6. ,  1.8],
       [ 6.2,  2.8,  4.8,  1.8],
       [ 6.1,  3. ,  4.9,  1.8],
       [ 6.4,  2.8,  5.6,  2.1],
       [ 7.2,  3. ,  5.8,  1.6],
       [ 7.4,  2.8,  6.1,  1.9],
       [ 7.9,  3.8,  6.4,  2. ],
       [ 6.4,  2.8,  5.6,  2.2],
       [ 6.3,  2.8,  5.1,  1.5],
       [ 6.1,  2.6,  5.6,  1.4],
       [ 7.7,  3. ,  6.1,  2.3],
       [ 6.3,  3.4,  5.6,  2.4],
       [ 6.4,  3.1,  5.5,  1.8],
       [ 6. ,  3. ,  4.8,  1.8],
       [ 6.9,  3.1,  5.4,  2.1],
       [ 6.7,  3.1,  5.6,  2.4],
       [ 6.9,  3.1,  5.1,  2.3],
       [ 5.8,  2.7,  5.1,  1.9],
       [ 6.8,  3.2,  5.9,  2.3],
       [ 6.7,  3.3,  5.7,  2.5],
       [ 6.7,  3. ,  5.2,  2.3],
       [ 6.3,  2.5,  5. ,  1.9],
       [ 6.5,  3. ,  5.2,  2. ],
       [ 6.2,  3.4,  5.4,  2.3],
       [ 5.9,  3. ,  5.1,  1.8]])



In [7]:

    
x = iris.data[:,2:] # the attributes
y = iris.target # the target variable



In [8]:

    
for a, b in zip(x, y):
    print(a, b)









    



[ 1.4  0.2] 0
[ 1.4  0.2] 0
[ 1.3  0.2] 0
[ 1.5  0.2] 0
[ 1.4  0.2] 0
[ 1.7  0.4] 0
[ 1.4  0.3] 0
[ 1.5  0.2] 0
[ 1.4  0.2] 0
[ 1.5  0.1] 0
[ 1.5  0.2] 0
[ 1.6  0.2] 0
[ 1.4  0.1] 0
[ 1.1  0.1] 0
[ 1.2  0.2] 0
[ 1.5  0.4] 0
[ 1.3  0.4] 0
[ 1.4  0.3] 0
[ 1.7  0.3] 0
[ 1.5  0.3] 0
[ 1.7  0.2] 0
[ 1.5  0.4] 0
[ 1.   0.2] 0
[ 1.7  0.5] 0
[ 1.9  0.2] 0
[ 1.6  0.2] 0
[ 1.6  0.4] 0
[ 1.5  0.2] 0
[ 1.4  0.2] 0
[ 1.6  0.2] 0
[ 1.6  0.2] 0
[ 1.5  0.4] 0
[ 1.5  0.1] 0
[ 1.4  0.2] 0
[ 1.5  0.1] 0
[ 1.2  0.2] 0
[ 1.3  0.2] 0
[ 1.5  0.1] 0
[ 1.3  0.2] 0
[ 1.5  0.2] 0
[ 1.3  0.3] 0
[ 1.3  0.3] 0
[ 1.3  0.2] 0
[ 1.6  0.6] 0
[ 1.9  0.4] 0
[ 1.4  0.3] 0
[ 1.6  0.2] 0
[ 1.4  0.2] 0
[ 1.5  0.2] 0
[ 1.4  0.2] 0
[ 4.7  1.4] 1
[ 4.5  1.5] 1
[ 4.9  1.5] 1
[ 4.   1.3] 1
[ 4.6  1.5] 1
[ 4.5  1.3] 1
[ 4.7  1.6] 1
[ 3.3  1. ] 1
[ 4.6  1.3] 1
[ 3.9  1.4] 1
[ 3.5  1. ] 1
[ 4.2  1.5] 1
[ 4.  1.] 1
[ 4.7  1.4] 1
[ 3.6  1.3] 1
[ 4.4  1.4] 1
[ 4.5  1.5] 1
[ 4.1  1. ] 1
[ 4.5  1.5] 1
[ 3.9  1.1] 1
[ 4.8  1.8] 1
[ 4.   1.3] 1
[ 4.9  1.5] 1
[ 4.7  1.2] 1
[ 4.3  1.3] 1
[ 4.4  1.4] 1
[ 4.8  1.4] 1
[ 5.   1.7] 1
[ 4.5  1.5] 1
[ 3.5  1. ] 1
[ 3.8  1.1] 1
[ 3.7  1. ] 1
[ 3.9  1.2] 1
[ 5.1  1.6] 1
[ 4.5  1.5] 1
[ 4.5  1.6] 1
[ 4.7  1.5] 1
[ 4.4  1.3] 1
[ 4.1  1.3] 1
[ 4.   1.3] 1
[ 4.4  1.2] 1
[ 4.6  1.4] 1
[ 4.   1.2] 1
[ 3.3  1. ] 1
[ 4.2  1.3] 1
[ 4.2  1.2] 1
[ 4.2  1.3] 1
[ 4.3  1.3] 1
[ 3.   1.1] 1
[ 4.1  1.3] 1
[ 6.   2.5] 2
[ 5.1  1.9] 2
[ 5.9  2.1] 2
[ 5.6  1.8] 2
[ 5.8  2.2] 2
[ 6.6  2.1] 2
[ 4.5  1.7] 2
[ 6.3  1.8] 2
[ 5.8  1.8] 2
[ 6.1  2.5] 2
[ 5.1  2. ] 2
[ 5.3  1.9] 2
[ 5.5  2.1] 2
[ 5.  2.] 2
[ 5.1  2.4] 2
[ 5.3  2.3] 2
[ 5.5  1.8] 2
[ 6.7  2.2] 2
[ 6.9  2.3] 2
[ 5.   1.5] 2
[ 5.7  2.3] 2
[ 4.9  2. ] 2
[ 6.7  2. ] 2
[ 4.9  1.8] 2
[ 5.7  2.1] 2
[ 6.   1.8] 2
[ 4.8  1.8] 2
[ 4.9  1.8] 2
[ 5.6  2.1] 2
[ 5.8  1.6] 2
[ 6.1  1.9] 2
[ 6.4  2. ] 2
[ 5.6  2.2] 2
[ 5.1  1.5] 2
[ 5.6  1.4] 2
[ 6.1  2.3] 2
[ 5.6  2.4] 2
[ 5.5  1.8] 2
[ 4.8  1.8] 2
[ 5.4  2.1] 2
[ 5.6  2.4] 2
[ 5.1  2.3] 2
[ 5.1  1.9] 2
[ 5.9  2.3] 2
[ 5.7  2.5] 2
[ 5.2  2.3] 2
[ 5.   1.9] 2
[ 5.2  2. ] 2
[ 5.4  2.3] 2
[ 5.1  1.8] 2



In [9]:

    
y









    Out[9]:





array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])



In [17]:

    
# shuffling data (which is X), and target (which is Y) and adding into two seperate lists
shuf_x = []
shuf_y = []
shuf_index = list(range(len(x)))
shuffle(shuf_index)
for i in shuf_index:
    shuf_x.append(x[i])
    shuf_y.append(y[i])



In [18]:

    
chunk_length = int(len(shuf_x)/ 5)
chunk_length









    Out[18]:





30



In [19]:

    
chunk_length = int(len(shuf_y)/ 5)
chunk_length









    Out[19]:





30



In [20]:

    
def chunks(l, num):
    num = max(1, num)
    return [l[i:i + num] for i in range(0, len(l), num)]



In [21]:

    
chunk_y = chunks(shuf_y, chunk_length)



In [22]:

    
chunk_x = chunks(shuf_x, chunk_length)



In [23]:

    
dt = tree.DecisionTreeClassifier()



In [30]:

    
Average_list = []

for x, y in zip(chunk_x, chunk_y):
     
    #Popping first item off the list
    x_test = chunk_x.pop(0)
    x_train = sum(chunk_x, [])
   
    #Adding it back on again
    chunk_x.append(x_test)
   
    #Popping first item off the list
    y_test = chunk_y.pop(0)
    y_train = sum(chunk_y, [])
   
    #Popping it back on again
    chunk_y.append(y_test)
   
    #fitting training
    dt = dt.fit(x_train,y_train)
   
    #Predicting
    y_pred=dt.predict(x_test)
   
    #Getting the accurancy score
Accuracy_score = accuracy_score(y_test, y_pred)
   
    #Creating a list of averages:
Average_list.append(Accuracy_score)



In [31]:

    
print(Average_list)









    



[0.93333333333333335]

Now we create our cross validation scores



In [2]:

    
from sklearn.cross_validation import cross_val_score



In [7]:

    
iris = datasets.load_iris()



In [8]:

    
x = iris.data[:,2:] 
y = iris.target



In [9]:

    
dt = tree.DecisionTreeClassifier()



In [10]:

    
dt = dt.fit(x,y)



In [11]:

    
# http://scikit-learn.org/stable/modules/cross_validation.html#computing-cross-validated-metrics
scores = cross_val_score(dt,x,y,cv=5) #We're passing in our values and getting an array of values back

and dt is pass the decision tree classifier



In [12]:

    
scores









    Out[12]:





array([ 0.96666667,  0.96666667,  0.9       ,  0.93333333,  1.        ])



In [13]:

    
import numpy as np



In [14]:

    
np.mean(scores) #here we get our average result









    Out[14]:





0.95333333333333337



In [ ]: