In [1]:
# This recipe will create the most important post-model validation
# exercise - cross validation.
# Steps:
#  -> Create some data.
#  -> Fit a classifier on the different folds.
#  -> Create a holdout set of data for testing after cross-validation.

In [2]:
from sklearn.datasets import make_regression

# Configuration for the synthetic dataset.
N = 1000       # total number of samples to generate
holdout = 200  # samples reserved for final evaluation after cross-validation

In [3]:
X, y = make_regression(N, shuffle=True)

In [4]:
# Reserve the first `holdout` rows as the final test set; the rest
# (suffix `_t`) is what cross-validation will operate on.
X_h = X[:holdout]
y_h = y[:holdout]
X_t = X[holdout:]
y_t = y[holdout:]

In [5]:
from sklearn.cross_validation import KFold

In [6]:
# KFold gives us the option of choosing how many folds we want,
# if we want the values to be indices or Booleans, if we want to
# shuffle the dataset, and the random state (for reproducibility).

In [7]:
# Old-style KFold: constructed from the number of observations and
# `n_folds`; iterating over it yields (train, test) index arrays.
kfold = KFold(len(y_t), n_folds=4)
# Template for reporting each fold's train/test sizes.
output_string = "Fold: {}, N_train: {}, N_test: {}"

In [8]:
# Report the train/test sizes for each fold.  Parenthesizing the print
# argument makes this cell valid under both Python 2 and Python 3
# (the bare `print expr` statement form is a SyntaxError in Python 3).
for i, (train, test) in enumerate(kfold):
    print(output_string.format(i, len(y_t[train]), len(y_t[test])))


Fold: 0, N_train: 600, N_test: 200
Fold: 1, N_train: 600, N_test: 200
Fold: 2, N_train: 600, N_test: 200
Fold: 3, N_train: 600, N_test: 200

In [9]:
import numpy as np
import pandas as pd

In [13]:
patients = np.repeat(np.arange(0, 100, dtype=np.int8), 8)

In [14]:
# One noisy measurement per row, keyed by the patient it belongs to.
# NOTE(review): np.random is unseeded here, so `ys` differs across runs.
measurements = pd.DataFrame({'patient_id': patients,
                             'ys': np.random.normal(loc=0, scale=1,
                                                    size=800)})

In [15]:
# Fold over unique patients rather than individual rows, so that all
# measurements from a given patient land entirely in either the
# training or the test split — this prevents leakage between splits.
custids = np.unique(measurements.patient_id)
# Old-style KFold API: (n_observations, n_folds).
customer_kfold = KFold(custids.size, n_folds=4)

In [17]:
# For each fold, select whole patients for training; every row whose
# patient is NOT in the training set forms the test split, so the two
# splits partition the measurements exactly.  Parenthesizing the print
# argument keeps this cell valid under both Python 2 and Python 3.
for i, (train, test) in enumerate(customer_kfold):
    train_cust_ids = custids[train]
    training = measurements[measurements.patient_id.isin(
        train_cust_ids)]
    testing = measurements[~measurements.patient_id.isin(
        train_cust_ids)]
    print(output_string.format(i, len(training), len(testing)))


Fold: 0, N_train: 600, N_test: 200
Fold: 1, N_train: 600, N_test: 200
Fold: 2, N_train: 600, N_test: 200
Fold: 3, N_train: 600, N_test: 200

In [ ]: