In [1]:
# This recipe will demonstrate the most important model-validation
# exercise - cross-validation.
# Steps:
# -> Create some data.
# -> Fit an estimator on the different folds (the data comes from
#    make_regression, so we fit regressors, not classifiers).
# -> Create a holdout set of data for testing after cross-validation.
In [2]:
# Configuration: total number of samples to generate, and how many of
# them to reserve as a final holdout set.
from sklearn.datasets import make_regression

N = 1000
holdout = 200
In [3]:
X, y = make_regression(N, shuffle=True)
In [4]:
# Partition the data: the first `holdout` rows (X_h, y_h) are reserved
# for final evaluation; the remainder (X_t, y_t) feeds cross-validation.
X_h, X_t = X[:holdout], X[holdout:]
y_h, y_t = y[:holdout], y[holdout:]
In [5]:
from sklearn.cross_validation import KFold
In [6]:
# KFold gives us the option of choosing how many folds we want,
# if we want the values to be indices or Booleans, if we want to
# shuffle the dataset, and the random state (for reproducibility).
In [7]:
kfold = KFold(len(y_t), n_folds=4)
output_string = "Fold: {}, N_train: {}, N_test: {}"
In [8]:
for i, (train, test) in enumerate(kfold):
print output_string.format(i, len(y_t[train]), len(y_t[test]))
In [9]:
# numpy/pandas support the grouped (per-patient) cross-validation
# example below.  (Best practice would place these in the first
# import cell of the notebook.)
import numpy as np
import pandas as pd
In [13]:
patients = np.repeat(np.arange(0, 100, dtype=np.int8), 8)
In [14]:
# One row per (patient, measurement): standard-normal readings, eight
# per patient.  len(patients) instead of the hard-coded 800 keeps the
# two columns in sync if the patient count above ever changes.
measurements = pd.DataFrame({'patient_id': patients,
                             'ys': np.random.normal(0, 1, len(patients))})
In [15]:
custids = np.unique(measurements.patient_id)
customer_kfold = KFold(custids.size, n_folds=4)
In [17]:
for i, (train, test) in enumerate(customer_kfold):
train_cust_ids = custids[train]
training = measurements[measurements.patient_id.isin(
train_cust_ids)]
testing = measurements[~measurements.patient_id.isin(
train_cust_ids)]
print output_string.format(i, len(training), len(testing))
In [ ]: