In [ ]:
"""
IPython Notebook v4.0 para python 2.7
Librerías adicionales: numpy, matplotlib
Contenido bajo licencia CC-BY 4.0. Código bajo licencia MIT. (c) Sebastian Flores.
"""
# Configuracion para recargar módulos y librerías
%reload_ext autoreload
%autoreload 2
%matplotlib inline
from IPython.core.display import HTML
HTML(open("style/mat281.css", "r").read())
Panchulo es un minero, debe medir el pilar que sostiene el techo de una galería para verificar que la sección transversal del pilar es la indicada.
¿Cuáles son las posibles fuentes de error?
La medición del error de predicción no necesita conocer el modelo subyacente: puede ser un software externo, una black box o una implementación casera.
Necesitamos:
El holdout set no debe utilizarse para nada más que el testeo final del modelo, sus datos no deben utilizarse para calibrar el modelo en ningún aspecto.
Si los datos reservados para predicción se han utilizado durante el entrenamiento, el error predictivo del modelo estará sesgado al incluir el error de entrenamiento/calibración del modelo.
Ilustraremos el funcionamiento del método con datos sintéticos: $$ y(x) = 5 \cos \Big( \frac{\pi}{4} x \Big) + \mathcal{N}\Big(0,1\Big)$$
Buscaremos ajustar un modelo del tipo $$ y(x) = a \cos \Big( b x + c\Big) + d$$ minimizando el error cuadrático.
El error predictivo del modelo será calculado utilizando RMSE: $$ E(o,p) = \sqrt{ \frac{1}{N}\sum_{i=1}^N (o_i - p_i)^2 }$$ El RMSE corresponde a la desviación estándar de los residuos.
La implementación numérica del holdout set no depende del modelo a ajustar: puede realizarse para regresión lineal o logística, modelos discretos, algoritmos de machine learning, etc.
Los pasos son:
In [ ]:
import numpy as np
from mat281_code import model

# Read the dataset and decide how many rows are used for training
data = model.load_data("data/dataN5000.txt") # Change here
N = len(data)
split = int(N * 0.7) # Change here

# Shuffle the rows with a fixed seed so the split is repeatable
np.random.seed(23) # Change here
data = np.random.permutation(data)

# The first `split` rows are the training set; the rest is the holdout set
training_data, testing_data = data[:split, :], data[split:, :]

# Fit on the training rows only, then measure the error on the holdout rows
training_params = model.get_params(training_data)
prediction_error = model.get_error(training_params, testing_data)
print "Prediction error estimated on ", prediction_error, "\n"

# Fit again using every row, then report and plot both sets of parameters
all_data_params = model.get_params(data)
model.full_report(training_data, testing_data, training_params, all_data_params)
model.plot(training_data, testing_data, training_params, all_data_params)
¿Cuánto afecta la selección de la partición al error de predicción?
Si tenemos $N$ datos y queremos un split de $n$ datos de entrenamiento y $N-n$ datos de testing, existen $ C^N_n = \frac{N!}{n!(N-n)!}$ posibilidades.
En nuestro caso, para $N=50$ y $n=0.7 \times 50 = 35$, tenemos $>2.2 \cdot 10^{12}$ posibles combinaciones distintas de datos.
In [ ]:
# scipy.misc.comb was deprecated and later removed from SciPy;
# scipy.special.comb is the supported home of the same function.
from scipy.special import comb

# Number of ways to pick the 35 training rows out of 50, and the 15 testing
# rows out of 50. Both counts are equal because C(N, n) == C(N, N - n).
# A single-argument print(...) gives the same output on Python 2 and Python 3.
print(comb(50, 35))
print(comb(50, 15))
In [ ]:
import numpy as np
from mat281_code import model
from matplotlib import pyplot as plt

# Number of different seeds to test
total_seeds = 250

# Load data
data = model.load_data("data/dataN5000.txt")
N = data.shape[0]
split = int(0.7*N)

# One holdout error per seed
prediction_error = np.zeros(total_seeds)
for seed in range(total_seeds):
    # Permute a fresh copy of the loaded data. Rebinding `data` to its own
    # permutation would make the split for a given seed depend on every
    # previous iteration, so a single seed could not be reproduced alone.
    np.random.seed(seed)
    shuffled = np.random.permutation(data)
    # Do the split
    training_data = shuffled[:split, :]
    testing_data = shuffled[split:, :]
    # Train model excluding the holdout set
    training_params = model.get_params(training_data)
    # Test with the holdout set
    prediction_error[seed] = model.get_error(training_params, testing_data)

# Histogram of the prediction error over all seeds
fig = plt.figure(figsize=(16,8))
plt.hist(prediction_error, rwidth=0.8)
plt.xlim(0,3)
plt.show()
In [ ]:
import numpy as np
from mat281_code import model
from matplotlib import pyplot as plt

# Number of different seeds to test
total_seeds = 250
# Dataset sizes to compare; one file "data/dataN<size>.txt" is read per size
datasizes = (20,50,100,500,5000)

# Mean and standard deviation of the holdout error, one entry per data size
mean_prediction_error = np.zeros(len(datasizes))
std_prediction_error = np.zeros(len(datasizes))
for i, N in enumerate(datasizes):
    # Load data
    datafile = "data/dataN%d.txt" %N
    data = model.load_data(datafile)
    split = int(0.7*N)
    prediction_error = np.zeros(total_seeds)
    for seed in range(total_seeds):
        # Permute a fresh copy of the loaded data. Rebinding `data` to its
        # own permutation would make the split for a given seed depend on
        # every previous iteration.
        np.random.seed(seed)
        shuffled = np.random.permutation(data)
        # Do the split
        training_data = shuffled[:split, :]
        testing_data = shuffled[split:, :]
        # Train model excluding the holdout set
        training_params = model.get_params(training_data)
        # Test with the holdout set
        prediction_error[seed] = model.get_error(training_params, testing_data)
    # Compute mean and std over the seeds, and save into the vectors
    mean_prediction_error[i] = prediction_error.mean()
    std_prediction_error[i] = prediction_error.std()

# Line plot of the mean prediction error (solid) with a band of one
# standard deviation above and below it (dotted), against the data size
fig = plt.figure(figsize=(16,8))
plt.plot(datasizes, mean_prediction_error, 'b', lw=2.0)
plt.plot(datasizes, mean_prediction_error-std_prediction_error, 'b:', lw=2.0)
plt.plot(datasizes, mean_prediction_error+std_prediction_error, 'b:', lw=2.0)
plt.xlabel("Data size")
plt.ylabel("Prediction Error")
plt.show()
In [ ]:
import numpy as np
from mat281_code import model
from matplotlib import pyplot as plt

# Number of different seeds to test
total_seeds = 250

# Load data
data = model.load_data("data/dataN5000.txt") # Change here
N = data.shape[0]
split = int(0.7*N) # Change here

# One training error and one holdout error per seed
training_error = np.zeros(total_seeds)
prediction_error = np.zeros(total_seeds)
for seed in range(total_seeds):
    # Permute a fresh copy of the loaded data. Rebinding `data` to its own
    # permutation would make the split for a given seed depend on every
    # previous iteration.
    np.random.seed(seed)
    shuffled = np.random.permutation(data)
    # Do the split
    training_data = shuffled[:split, :]
    testing_data = shuffled[split:, :]
    # Train model excluding the holdout set
    training_params = model.get_params(training_data)
    # Error on the data used for training
    training_error[seed] = model.get_error(training_params, training_data)
    # Error on the holdout set
    prediction_error[seed] = model.get_error(training_params, testing_data)

# Histograms: training error on top, prediction error below, same x range
fig = plt.figure(figsize=(16,8))
plt.subplot(2,1,1)
plt.hist(training_error, rwidth=0.8, color="red")
plt.ylabel("Training Error")
plt.xlim(0,3)
plt.subplot(2,1,2)
plt.hist(prediction_error, rwidth=0.8)
plt.ylabel("Prediction Error")
plt.xlim(0,3)
plt.show()
In [ ]:
import numpy as np
from mat281_code import model
from matplotlib import pyplot as plt

# Number of different seeds to test
total_seeds = 100
N = 5000 # N>=50
# Fractions of the data used for training: 0.1, 0.2, ..., 0.9
splitsizes = np.linspace(0.1,0.9,9)

# Mean and standard deviation of each error, one entry per split fraction
mean_prediction_error = np.zeros(len(splitsizes))
std_prediction_error = np.zeros(len(splitsizes))
mean_training_error = np.zeros(len(splitsizes))
std_training_error = np.zeros(len(splitsizes))

# Load data once: the file does not depend on the split fraction, and the
# loop below never modifies `data`.
datafile = "data/dataN%d.txt" %N
data = model.load_data(datafile)

for i, s in enumerate(splitsizes):
    split = int(s*N)
    split = min(max(split, 4), N-4) # Requirement for optimization
    prediction_error = np.zeros(total_seeds)
    training_error = np.zeros(total_seeds)
    for seed in range(total_seeds):
        # Permute a fresh copy of the loaded data. Rebinding `data` to its
        # own permutation would make the split for a given seed depend on
        # every previous iteration.
        np.random.seed(seed)
        shuffled = np.random.permutation(data)
        # Do the split
        training_data = shuffled[:split, :]
        testing_data = shuffled[split:, :]
        # Train model excluding the holdout set
        training_params = model.get_params(training_data)
        # Error on the data used for training
        training_error[seed] = model.get_error(training_params, training_data)
        # Error on the holdout set
        prediction_error[seed] = model.get_error(training_params, testing_data)
    # Compute mean and std over the seeds, and save into the vectors
    mean_training_error[i] = training_error.mean()
    std_training_error[i] = training_error.std()
    mean_prediction_error[i] = prediction_error.mean()
    std_prediction_error[i] = prediction_error.std()

# Line plots of the mean error (solid) with a band of one standard deviation
# above and below it (dotted): training error on top, prediction error below
fig = plt.figure(figsize=(16,8))
plt.subplot(2,1,1)
plt.plot(splitsizes, mean_training_error, 'r', lw=2.0)
plt.plot(splitsizes, mean_training_error-std_training_error, 'r:', lw=2.0)
plt.plot(splitsizes, mean_training_error+std_training_error, 'r:', lw=2.0)
plt.xlabel("Split size for training")
plt.ylabel("Training Error")
plt.subplot(2,1,2)
plt.plot(splitsizes, mean_prediction_error, 'b', lw=2.0)
plt.plot(splitsizes, mean_prediction_error-std_prediction_error, 'b:', lw=2.0)
plt.plot(splitsizes, mean_prediction_error+std_prediction_error, 'b:', lw=2.0)
plt.xlabel("Split size for training")
plt.ylabel("Prediction Error")
plt.show()
El holdout se ve afectado: