In [16]:
import numpy as np

from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import cross_val_score

In [17]:
# Fixed seed (0) so the "missing value" pattern drawn below is
# reproducible across runs.
rng = np.random.RandomState(seed=0)

In [18]:
# Load the Boston housing regression dataset (506 samples, 13 features).
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed
# in 1.2 — this notebook requires an older sklearn; verify the environment
# or switch datasets before re-running.
dataset = load_boston()
X_full, y_full = dataset.data, dataset.target
n_samples, n_features = X_full.shape

In [19]:
# Baseline: cross-validated score of a 100-tree random forest on the
# complete dataset (no missing values).  random_state=0 makes the forest
# deterministic so the scores below are comparable.
# NOTE(review): cross_val_score is imported from sklearn.cross_validation,
# a module removed in sklearn 0.20 (now sklearn.model_selection) — confirm
# the installed version.
estimator = RandomForestRegressor(n_estimators=100, random_state=0)
cv_scores = cross_val_score(estimator, X_full, y_full)
score = cv_scores.mean()
print("Score with the entire dataset = %.2f" % score)


Score with the entire dataset = 0.56

In [22]:
# Mark 75% of the rows as "will contain a missing value".
missing_rate = 0.75
# int(), not np.floor(): array sizes must be integers — np.floor returns a
# float, which modern NumPy rejects as a shape/size argument.
n_missing_samples = int(n_samples * missing_rate)
# Boolean row mask: False for complete rows, True for rows that will get a
# missing value (shuffled into random positions in a later cell).
# Use the builtin `bool` dtype — the `np.bool` alias was removed in
# NumPy 1.24.
missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                      dtype=bool),
                             np.ones(n_missing_samples,
                                     dtype=bool)))
missing_samples


Out[22]:
array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True], dtype=bool)

In [33]:
# Shuffle the row mask IN PLACE so the missing rows are scattered randomly
# across the dataset instead of being the trailing 75%.  Uses the seeded
# `rng`, so the permutation is reproducible for a fixed seed — but note
# that re-running this cell advances the RNG and reshuffles the mask.
rng.shuffle(missing_samples)
missing_samples


Out[33]:
array([ True, False,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False, False,  True,
        True,  True, False,  True,  True, False, False,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False, False, False,  True,
       False,  True,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True, False, False,  True,  True,  True, False, False, False,
       False,  True,  True,  True,  True, False,  True,  True, False,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True, False,  True,  True, False,  True, False,  True,  True,
        True,  True, False, False, False,  True, False,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True, False,
       False, False,  True, False,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True, False, False,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True, False, False,  True, False,  True,  True,  True,  True,
       False, False,  True,  True,  True,  True,  True, False,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
       False,  True,  True, False,  True, False,  True,  True,  True,
       False, False, False,  True,  True,  True, False,  True, False,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True, False,  True,  True,
        True, False,  True,  True,  True, False,  True, False,  True,
       False,  True,  True, False,  True,  True,  True,  True,  True,
        True, False, False,  True,  True,  True,  True,  True,  True,
        True, False, False,  True,  True, False,  True,  True, False,
        True, False,  True,  True,  True,  True, False,  True, False,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True, False, False,  True,  True, False,
        True,  True, False, False,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True, False,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True, False,  True, False,
        True, False, False,  True,  True,  True,  True,  True, False,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True, False,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
       False,  True, False,  True,  True,  True,  True,  True, False,
        True,  True, False, False,  True,  True,  True,  True,  True,
       False,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True, False, False,  True,  True,  True, False, False, False,
       False,  True,  True,  True,  True, False,  True,  True, False,
        True,  True,  True, False,  True, False,  True, False,  True,
        True,  True, False,  True,  True, False,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False, False], dtype=bool)

In [34]:
# For each missing row, choose which feature column will be blanked out.
# randint(low, high, size) returns integers in [low, high).
# Cast the size to int: the upstream cell computes n_missing_samples with
# np.floor (a float), and modern NumPy rejects float sizes.
missing_features = rng.randint(0, n_features, int(n_missing_samples))
missing_features


Out[34]:
array([11,  1,  9, 12,  9, 11,  9,  0, 12,  9,  9, 12,  0,  5,  4, 11,  3,
        8,  9, 12,  2,  0, 12,  4,  8, 10,  3, 11,  9,  7,  5,  1,  3, 10,
        0,  4,  1,  3,  4,  8,  8,  9,  9,  5, 10,  0,  3,  5,  8,  0, 10,
        5,  5,  8, 11,  2,  0,  7,  9, 12,  7,  0,  3, 11,  1,  0,  8,  0,
       11, 10,  6,  2,  1,  3,  4,  5,  1,  6, 10, 12,  9,  1,  3,  8,  2,
       10,  6,  0,  4,  4, 12,  0,  3,  3,  6,  7,  6,  7,  1, 10,  6,  8,
       11,  2,  5, 12,  0,  2, 11, 12,  5,  9,  9,  9,  3,  0,  5,  2,  3,
       10,  3, 12, 11,  9,  0,  6, 10, 12,  6,  1,  6,  5,  6, 12,  2, 11,
        0,  1,  8, 10, 12,  0,  4,  8,  2, 12,  7,  8,  0,  6, 10,  4,  0,
        5,  7,  1,  2,  0, 10, 11,  8,  9,  5,  6,  6,  3,  2, 12, 12, 10,
       12,  4,  1, 11,  8, 11,  6,  2,  9,  5, 10, 11,  1,  6,  9,  6, 11,
        0,  3,  9,  7,  9,  0,  6,  8,  2, 10,  9, 10,  8, 10, 10,  1, 11,
        3,  4,  4,  1,  1,  5, 11,  9,  6,  8,  1, 10,  9, 11,  0, 12,  0,
        0,  2,  8,  1, 11,  0,  8,  1,  6,  0, 12,  8,  8,  0, 12, 12, 12,
       12,  8,  9,  3,  3,  0, 10, 11,  7,  4,  2,  1,  6, 10,  9,  2,  3,
        4,  9,  9,  7,  2, 12,  0,  1,  6, 12,  8,  7,  9, 12,  7,  1,  3,
       12,  2,  7,  2,  8,  7,  6,  3,  5,  0, 10,  4,  5, 11,  9,  6,  5,
        1, 12,  6, 10,  5,  0, 10, 12, 10,  8,  8,  3,  7, 11, 11,  5,  2,
        9, 10,  5,  0, 11,  4,  9,  6,  2,  6,  1,  4,  7,  3, 11,  7,  0,
        5,  2,  8,  9, 12,  5, 12,  0,  6,  0,  1,  3,  4,  6,  0,  7,  4,
        3, 12,  3,  3,  2,  1, 12,  3, 12,  8,  1,  9,  8,  8,  4,  6, 10,
       12, 12,  3,  6, 12,  9,  5, 10,  8,  9,  0,  7, 11,  6,  4, 11,  8,
        4,  5,  5,  6, 12])

In [41]:
# Listwise deletion: drop every row flagged as missing and score on what
# remains.  With 75% of the rows gone the forest trains on far less data,
# which is why this score is expected to be the worst of the three.
complete_rows = ~missing_samples
X_filtered = X_full[complete_rows, :]
y_filtered = y_full[complete_rows]
estimator = RandomForestRegressor(n_estimators=100, random_state=0)
score = cross_val_score(estimator, X_filtered, y_filtered).mean()
print("Score without the samples containing missing values = %.2f" % score)


Score without the samples containing missing values = 0.31

In [48]:
# Estimate the score after mean-imputation of the missing values.
X_missing = X_full.copy()
# Zero out one chosen feature per affected row; 0 then serves as the
# missing-value marker for the imputer below.
X_missing[np.where(missing_samples)[0], missing_features] = 0
y_missing = y_full.copy()

# `Imputer` was removed in scikit-learn 0.22.  Prefer SimpleImputer when
# available (it always imputes column-wise, matching axis=0 here) and fall
# back to the legacy class on old installs.
try:
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values=0, strategy="mean")
except ImportError:
    imputer = Imputer(missing_values=0, strategy="mean", axis=0)

# Pipeline: impute first, then fit the forest — cross_val_score re-fits the
# imputer inside each CV fold, avoiding leakage from the held-out fold.
estimator = Pipeline([("imputer", imputer),
                      ("forest", RandomForestRegressor(random_state=0,
                                                       n_estimators=100))])
score = cross_val_score(estimator, X_missing, y_missing).mean()
print("Score after imputation of the missing values = %.2f" % score)


Score after imputation of the missing values = 0.56