In [1]:
# This recipe will create the most important post-model validation
# exercise - cross validation.
# Steps:
#  -> Create some data.
#  -> Fit a classifier on the different folds.
#  -> Create a holdout set of data for testing after cross-validation.

In [2]:
from sklearn.datasets import make_regression

# Configuration for the synthetic dataset.
N = 1000       # total number of samples to generate
holdout = 200  # samples reserved for final evaluation after cross-validation

In [3]:
X, y = make_regression(N, shuffle=True)

In [4]:
# Reserve the first `holdout` rows as the final test set; the rest
# (suffix `_t`) is what cross-validation will operate on.
X_h = X[:holdout]
y_h = y[:holdout]
X_t = X[holdout:]
y_t = y[holdout:]

In [5]:
from sklearn.cross_validation import KFold

In [6]:
# KFold gives us the option of choosing how many folds we want,
# if we want the values to be indices or Booleans, if we want to
# shuffle the dataset, and the random state (for reproducibility).

In [7]:
# Old-style KFold: constructed from the number of observations and
# `n_folds`; iterating over it yields (train, test) index arrays.
kfold = KFold(len(y_t), n_folds=4)
# Template for reporting each fold's train/test sizes.
output_string = "Fold: {}, N_train: {}, N_test: {}"

In [8]:
# Report the train/test sizes for each fold.  Parenthesizing the print
# argument makes this cell valid under both Python 2 and Python 3
# (the bare `print expr` statement form is a SyntaxError in Python 3).
for i, (train, test) in enumerate(kfold):
    print(output_string.format(i, len(y_t[train]), len(y_t[test])))


Fold: 0, N_train: 600, N_test: 200
Fold: 1, N_train: 600, N_test: 200
Fold: 2, N_train: 600, N_test: 200
Fold: 3, N_train: 600, N_test: 200

In [9]:
import numpy as np
import pandas as pd

In [13]:
patients = np.repeat(np.arange(0, 100, dtype=np.int8), 8)

In [14]:
# One noisy measurement per row, keyed by the patient it belongs to.
# NOTE(review): np.random is unseeded here, so `ys` differs across runs.
measurements = pd.DataFrame({'patient_id': patients,
                             'ys': np.random.normal(loc=0, scale=1,
                                                    size=800)})

In [15]:
# Fold over unique patients rather than individual rows, so that all
# measurements from a given patient land entirely in either the
# training or the test split — this prevents leakage between splits.
custids = np.unique(measurements.patient_id)
# Old-style KFold API: (n_observations, n_folds).
customer_kfold = KFold(custids.size, n_folds=4)

In [17]:
# For each fold, select whole patients for training; every row whose
# patient is NOT in the training set forms the test split, so the two
# splits partition the measurements exactly.  Parenthesizing the print
# argument keeps this cell valid under both Python 2 and Python 3.
for i, (train, test) in enumerate(customer_kfold):
    train_cust_ids = custids[train]
    training = measurements[measurements.patient_id.isin(
        train_cust_ids)]
    testing = measurements[~measurements.patient_id.isin(
        train_cust_ids)]
    print(output_string.format(i, len(training), len(testing)))


Fold: 0, N_train: 600, N_test: 200
Fold: 1, N_train: 600, N_test: 200
Fold: 2, N_train: 600, N_test: 200
Fold: 3, N_train: 600, N_test: 200

In [ ]: