• Use the pseudocode you came up with in class to write your own 5-fold cross-validation function that splits the data set into
  • Don't forget to shuffle the input before assigning to the splits
  • You can use the fit
  • Test the results with the sklearn cross_val_score
  • In your PR, discuss what challenges you had creating this function and if it helped you better understand cross validation

In [6]:
import pandas as pd
%matplotlib inline
from sklearn import datasets
from sklearn import tree
import matplotlib.pyplot as plt
import numpy as np
from random import shuffle
from sklearn.metrics import accuracy_score

In [2]:
iris = datasets.load_iris() # load iris data set

In [3]:
iris.keys()


Out[3]:
dict_keys(['feature_names', 'target_names', 'DESCR', 'target', 'data'])

In [4]:
iris['target_names']


Out[4]:
array(['setosa', 'versicolor', 'virginica'], 
      dtype='<U10')

In [5]:
iris['target']


Out[5]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [6]:
iris['data']


Out[6]:
array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2],
       [ 5.4,  3.9,  1.7,  0.4],
       [ 4.6,  3.4,  1.4,  0.3],
       [ 5. ,  3.4,  1.5,  0.2],
       [ 4.4,  2.9,  1.4,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 5.4,  3.7,  1.5,  0.2],
       [ 4.8,  3.4,  1.6,  0.2],
       [ 4.8,  3. ,  1.4,  0.1],
       [ 4.3,  3. ,  1.1,  0.1],
       [ 5.8,  4. ,  1.2,  0.2],
       [ 5.7,  4.4,  1.5,  0.4],
       [ 5.4,  3.9,  1.3,  0.4],
       [ 5.1,  3.5,  1.4,  0.3],
       [ 5.7,  3.8,  1.7,  0.3],
       [ 5.1,  3.8,  1.5,  0.3],
       [ 5.4,  3.4,  1.7,  0.2],
       [ 5.1,  3.7,  1.5,  0.4],
       [ 4.6,  3.6,  1. ,  0.2],
       [ 5.1,  3.3,  1.7,  0.5],
       [ 4.8,  3.4,  1.9,  0.2],
       [ 5. ,  3. ,  1.6,  0.2],
       [ 5. ,  3.4,  1.6,  0.4],
       [ 5.2,  3.5,  1.5,  0.2],
       [ 5.2,  3.4,  1.4,  0.2],
       [ 4.7,  3.2,  1.6,  0.2],
       [ 4.8,  3.1,  1.6,  0.2],
       [ 5.4,  3.4,  1.5,  0.4],
       [ 5.2,  4.1,  1.5,  0.1],
       [ 5.5,  4.2,  1.4,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 5. ,  3.2,  1.2,  0.2],
       [ 5.5,  3.5,  1.3,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 4.4,  3. ,  1.3,  0.2],
       [ 5.1,  3.4,  1.5,  0.2],
       [ 5. ,  3.5,  1.3,  0.3],
       [ 4.5,  2.3,  1.3,  0.3],
       [ 4.4,  3.2,  1.3,  0.2],
       [ 5. ,  3.5,  1.6,  0.6],
       [ 5.1,  3.8,  1.9,  0.4],
       [ 4.8,  3. ,  1.4,  0.3],
       [ 5.1,  3.8,  1.6,  0.2],
       [ 4.6,  3.2,  1.4,  0.2],
       [ 5.3,  3.7,  1.5,  0.2],
       [ 5. ,  3.3,  1.4,  0.2],
       [ 7. ,  3.2,  4.7,  1.4],
       [ 6.4,  3.2,  4.5,  1.5],
       [ 6.9,  3.1,  4.9,  1.5],
       [ 5.5,  2.3,  4. ,  1.3],
       [ 6.5,  2.8,  4.6,  1.5],
       [ 5.7,  2.8,  4.5,  1.3],
       [ 6.3,  3.3,  4.7,  1.6],
       [ 4.9,  2.4,  3.3,  1. ],
       [ 6.6,  2.9,  4.6,  1.3],
       [ 5.2,  2.7,  3.9,  1.4],
       [ 5. ,  2. ,  3.5,  1. ],
       [ 5.9,  3. ,  4.2,  1.5],
       [ 6. ,  2.2,  4. ,  1. ],
       [ 6.1,  2.9,  4.7,  1.4],
       [ 5.6,  2.9,  3.6,  1.3],
       [ 6.7,  3.1,  4.4,  1.4],
       [ 5.6,  3. ,  4.5,  1.5],
       [ 5.8,  2.7,  4.1,  1. ],
       [ 6.2,  2.2,  4.5,  1.5],
       [ 5.6,  2.5,  3.9,  1.1],
       [ 5.9,  3.2,  4.8,  1.8],
       [ 6.1,  2.8,  4. ,  1.3],
       [ 6.3,  2.5,  4.9,  1.5],
       [ 6.1,  2.8,  4.7,  1.2],
       [ 6.4,  2.9,  4.3,  1.3],
       [ 6.6,  3. ,  4.4,  1.4],
       [ 6.8,  2.8,  4.8,  1.4],
       [ 6.7,  3. ,  5. ,  1.7],
       [ 6. ,  2.9,  4.5,  1.5],
       [ 5.7,  2.6,  3.5,  1. ],
       [ 5.5,  2.4,  3.8,  1.1],
       [ 5.5,  2.4,  3.7,  1. ],
       [ 5.8,  2.7,  3.9,  1.2],
       [ 6. ,  2.7,  5.1,  1.6],
       [ 5.4,  3. ,  4.5,  1.5],
       [ 6. ,  3.4,  4.5,  1.6],
       [ 6.7,  3.1,  4.7,  1.5],
       [ 6.3,  2.3,  4.4,  1.3],
       [ 5.6,  3. ,  4.1,  1.3],
       [ 5.5,  2.5,  4. ,  1.3],
       [ 5.5,  2.6,  4.4,  1.2],
       [ 6.1,  3. ,  4.6,  1.4],
       [ 5.8,  2.6,  4. ,  1.2],
       [ 5. ,  2.3,  3.3,  1. ],
       [ 5.6,  2.7,  4.2,  1.3],
       [ 5.7,  3. ,  4.2,  1.2],
       [ 5.7,  2.9,  4.2,  1.3],
       [ 6.2,  2.9,  4.3,  1.3],
       [ 5.1,  2.5,  3. ,  1.1],
       [ 5.7,  2.8,  4.1,  1.3],
       [ 6.3,  3.3,  6. ,  2.5],
       [ 5.8,  2.7,  5.1,  1.9],
       [ 7.1,  3. ,  5.9,  2.1],
       [ 6.3,  2.9,  5.6,  1.8],
       [ 6.5,  3. ,  5.8,  2.2],
       [ 7.6,  3. ,  6.6,  2.1],
       [ 4.9,  2.5,  4.5,  1.7],
       [ 7.3,  2.9,  6.3,  1.8],
       [ 6.7,  2.5,  5.8,  1.8],
       [ 7.2,  3.6,  6.1,  2.5],
       [ 6.5,  3.2,  5.1,  2. ],
       [ 6.4,  2.7,  5.3,  1.9],
       [ 6.8,  3. ,  5.5,  2.1],
       [ 5.7,  2.5,  5. ,  2. ],
       [ 5.8,  2.8,  5.1,  2.4],
       [ 6.4,  3.2,  5.3,  2.3],
       [ 6.5,  3. ,  5.5,  1.8],
       [ 7.7,  3.8,  6.7,  2.2],
       [ 7.7,  2.6,  6.9,  2.3],
       [ 6. ,  2.2,  5. ,  1.5],
       [ 6.9,  3.2,  5.7,  2.3],
       [ 5.6,  2.8,  4.9,  2. ],
       [ 7.7,  2.8,  6.7,  2. ],
       [ 6.3,  2.7,  4.9,  1.8],
       [ 6.7,  3.3,  5.7,  2.1],
       [ 7.2,  3.2,  6. ,  1.8],
       [ 6.2,  2.8,  4.8,  1.8],
       [ 6.1,  3. ,  4.9,  1.8],
       [ 6.4,  2.8,  5.6,  2.1],
       [ 7.2,  3. ,  5.8,  1.6],
       [ 7.4,  2.8,  6.1,  1.9],
       [ 7.9,  3.8,  6.4,  2. ],
       [ 6.4,  2.8,  5.6,  2.2],
       [ 6.3,  2.8,  5.1,  1.5],
       [ 6.1,  2.6,  5.6,  1.4],
       [ 7.7,  3. ,  6.1,  2.3],
       [ 6.3,  3.4,  5.6,  2.4],
       [ 6.4,  3.1,  5.5,  1.8],
       [ 6. ,  3. ,  4.8,  1.8],
       [ 6.9,  3.1,  5.4,  2.1],
       [ 6.7,  3.1,  5.6,  2.4],
       [ 6.9,  3.1,  5.1,  2.3],
       [ 5.8,  2.7,  5.1,  1.9],
       [ 6.8,  3.2,  5.9,  2.3],
       [ 6.7,  3.3,  5.7,  2.5],
       [ 6.7,  3. ,  5.2,  2.3],
       [ 6.3,  2.5,  5. ,  1.9],
       [ 6.5,  3. ,  5.2,  2. ],
       [ 6.2,  3.4,  5.4,  2.3],
       [ 5.9,  3. ,  5.1,  1.8]])

In [7]:
x = iris.data[:,2:] # the attributes
y = iris.target # the target variable

In [8]:
for a, b in zip(x, y):
    print(a, b)


[ 1.4  0.2] 0
[ 1.4  0.2] 0
[ 1.3  0.2] 0
[ 1.5  0.2] 0
[ 1.4  0.2] 0
[ 1.7  0.4] 0
[ 1.4  0.3] 0
[ 1.5  0.2] 0
[ 1.4  0.2] 0
[ 1.5  0.1] 0
[ 1.5  0.2] 0
[ 1.6  0.2] 0
[ 1.4  0.1] 0
[ 1.1  0.1] 0
[ 1.2  0.2] 0
[ 1.5  0.4] 0
[ 1.3  0.4] 0
[ 1.4  0.3] 0
[ 1.7  0.3] 0
[ 1.5  0.3] 0
[ 1.7  0.2] 0
[ 1.5  0.4] 0
[ 1.   0.2] 0
[ 1.7  0.5] 0
[ 1.9  0.2] 0
[ 1.6  0.2] 0
[ 1.6  0.4] 0
[ 1.5  0.2] 0
[ 1.4  0.2] 0
[ 1.6  0.2] 0
[ 1.6  0.2] 0
[ 1.5  0.4] 0
[ 1.5  0.1] 0
[ 1.4  0.2] 0
[ 1.5  0.1] 0
[ 1.2  0.2] 0
[ 1.3  0.2] 0
[ 1.5  0.1] 0
[ 1.3  0.2] 0
[ 1.5  0.2] 0
[ 1.3  0.3] 0
[ 1.3  0.3] 0
[ 1.3  0.2] 0
[ 1.6  0.6] 0
[ 1.9  0.4] 0
[ 1.4  0.3] 0
[ 1.6  0.2] 0
[ 1.4  0.2] 0
[ 1.5  0.2] 0
[ 1.4  0.2] 0
[ 4.7  1.4] 1
[ 4.5  1.5] 1
[ 4.9  1.5] 1
[ 4.   1.3] 1
[ 4.6  1.5] 1
[ 4.5  1.3] 1
[ 4.7  1.6] 1
[ 3.3  1. ] 1
[ 4.6  1.3] 1
[ 3.9  1.4] 1
[ 3.5  1. ] 1
[ 4.2  1.5] 1
[ 4.  1.] 1
[ 4.7  1.4] 1
[ 3.6  1.3] 1
[ 4.4  1.4] 1
[ 4.5  1.5] 1
[ 4.1  1. ] 1
[ 4.5  1.5] 1
[ 3.9  1.1] 1
[ 4.8  1.8] 1
[ 4.   1.3] 1
[ 4.9  1.5] 1
[ 4.7  1.2] 1
[ 4.3  1.3] 1
[ 4.4  1.4] 1
[ 4.8  1.4] 1
[ 5.   1.7] 1
[ 4.5  1.5] 1
[ 3.5  1. ] 1
[ 3.8  1.1] 1
[ 3.7  1. ] 1
[ 3.9  1.2] 1
[ 5.1  1.6] 1
[ 4.5  1.5] 1
[ 4.5  1.6] 1
[ 4.7  1.5] 1
[ 4.4  1.3] 1
[ 4.1  1.3] 1
[ 4.   1.3] 1
[ 4.4  1.2] 1
[ 4.6  1.4] 1
[ 4.   1.2] 1
[ 3.3  1. ] 1
[ 4.2  1.3] 1
[ 4.2  1.2] 1
[ 4.2  1.3] 1
[ 4.3  1.3] 1
[ 3.   1.1] 1
[ 4.1  1.3] 1
[ 6.   2.5] 2
[ 5.1  1.9] 2
[ 5.9  2.1] 2
[ 5.6  1.8] 2
[ 5.8  2.2] 2
[ 6.6  2.1] 2
[ 4.5  1.7] 2
[ 6.3  1.8] 2
[ 5.8  1.8] 2
[ 6.1  2.5] 2
[ 5.1  2. ] 2
[ 5.3  1.9] 2
[ 5.5  2.1] 2
[ 5.  2.] 2
[ 5.1  2.4] 2
[ 5.3  2.3] 2
[ 5.5  1.8] 2
[ 6.7  2.2] 2
[ 6.9  2.3] 2
[ 5.   1.5] 2
[ 5.7  2.3] 2
[ 4.9  2. ] 2
[ 6.7  2. ] 2
[ 4.9  1.8] 2
[ 5.7  2.1] 2
[ 6.   1.8] 2
[ 4.8  1.8] 2
[ 4.9  1.8] 2
[ 5.6  2.1] 2
[ 5.8  1.6] 2
[ 6.1  1.9] 2
[ 6.4  2. ] 2
[ 5.6  2.2] 2
[ 5.1  1.5] 2
[ 5.6  1.4] 2
[ 6.1  2.3] 2
[ 5.6  2.4] 2
[ 5.5  1.8] 2
[ 4.8  1.8] 2
[ 5.4  2.1] 2
[ 5.6  2.4] 2
[ 5.1  2.3] 2
[ 5.1  1.9] 2
[ 5.9  2.3] 2
[ 5.7  2.5] 2
[ 5.2  2.3] 2
[ 5.   1.9] 2
[ 5.2  2. ] 2
[ 5.4  2.3] 2
[ 5.1  1.8] 2

In [9]:
y


Out[9]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [17]:
# shuffling data (which is X), and target (which is Y) and adding into two seperate lists
shuf_x = []
shuf_y = []
shuf_index = list(range(len(x)))
shuffle(shuf_index)
for i in shuf_index:
    shuf_x.append(x[i])
    shuf_y.append(y[i])

In [18]:
chunk_length = int(len(shuf_x)/ 5)
chunk_length


Out[18]:
30

In [19]:
chunk_length = int(len(shuf_y)/ 5)
chunk_length


Out[19]:
30

In [20]:
def chunks(l, num):
    num = max(1, num)
    return [l[i:i + num] for i in range(0, len(l), num)]

In [21]:
chunk_y = chunks(shuf_y, chunk_length)

In [22]:
chunk_x = chunks(shuf_x, chunk_length)

In [23]:
dt = tree.DecisionTreeClassifier()

In [30]:
Average_list = []

for x, y in zip(chunk_x, chunk_y):
     
    #Popping first item off the list
    x_test = chunk_x.pop(0)
    x_train = sum(chunk_x, [])
   
    #Adding it back on again
    chunk_x.append(x_test)
   
    #Popping first item off the list
    y_test = chunk_y.pop(0)
    y_train = sum(chunk_y, [])
   
    #Popping it back on again
    chunk_y.append(y_test)
   
    #fitting training
    dt = dt.fit(x_train,y_train)
   
    #Predicting
    y_pred=dt.predict(x_test)
   
    #Getting the accurancy score
Accuracy_score = accuracy_score(y_test, y_pred)
   
    #Creating a list of averages:
Average_list.append(Accuracy_score)

In [31]:
print(Average_list)


[0.93333333333333335]

Now we create our cross validation scores


In [2]:
from sklearn.cross_validation import cross_val_score

In [7]:
iris = datasets.load_iris()

In [8]:
x = iris.data[:,2:] 
y = iris.target

In [9]:
dt = tree.DecisionTreeClassifier()

In [10]:
dt = dt.fit(x,y)

In [11]:
# http://scikit-learn.org/stable/modules/cross_validation.html#computing-cross-validated-metrics
scores = cross_val_score(dt,x,y,cv=5) #We're passing in our values and getting an array of values back

and dt is pass the decision tree classifier


In [12]:
scores


Out[12]:
array([ 0.96666667,  0.96666667,  0.9       ,  0.93333333,  1.        ])

In [13]:
import numpy as np

In [14]:
np.mean(scores) #here we get our average result


Out[14]:
0.95333333333333337

In [ ]: