In [20]:
import numpy as np
from sklearn import datasets as d
from sklearn import preprocessing

try:
    # scikit-learn >= 0.20 ships the imputer in sklearn.impute
    from sklearn.impute import SimpleImputer
except ImportError:
    # older releases only provide preprocessing.Imputer
    from sklearn.preprocessing import Imputer as SimpleImputer

In [2]:
# Load the iris dataset and take a private copy of the feature matrix so that
# the NaN masking applied to iris_x below does not also overwrite iris.data.
iris = d.load_iris()
iris_x = iris.data.copy()

In [3]:
# Boolean mask shaped like iris_x; each entry is True with probability 0.25.
# A dedicated, seeded generator keeps the mask identical across kernel
# restarts so the cells below are reproducible.
mask_rng = np.random.RandomState(0)
masking_array = mask_rng.binomial(1, .25, iris_x.shape).astype(bool)

In [17]:
# Overwrite, in place, every entry of iris_x flagged by masking_array with NaN.
np.putmask(iris_x, masking_array, np.nan)

In [18]:
# First five rows of the mask, all columns.
masking_array[:5, :]


Out[18]:
array([[ True,  True,  True, False],
       [ True,  True,  True, False],
       [False,  True, False, False],
       [False, False, False,  True],
       [False,  True, False,  True]], dtype=bool)

In [19]:
# First five rows of the feature matrix after masking, all columns.
iris_x[:5, :]


Out[19]:
array([[ nan,  nan,  nan,  0.2],
       [ nan,  nan,  nan,  0.2],
       [ 4.7,  nan,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  nan],
       [ 5. ,  nan,  1.4,  nan]])

In [21]:
# Default strategy is the per-column mean; could pass strategy='median' or
# 'most_frequent'. SimpleImputer is used because preprocessing.Imputer was
# removed in scikit-learn 0.22.
impute = SimpleImputer()

In [23]:
# Learn the fill values from iris_x, then apply them to the same matrix.
fitted_imputer = impute.fit(iris_x)
iris_x_prime = fitted_imputer.transform(iris_x)

In [24]:
# First five rows of the imputed matrix, all columns.
iris_x_prime[:5, :]


Out[24]:
array([[ 5.8754386 ,  3.07232143,  3.75203252,  0.2       ],
       [ 5.8754386 ,  3.07232143,  3.75203252,  0.2       ],
       [ 4.7       ,  3.07232143,  1.3       ,  0.2       ],
       [ 4.6       ,  3.1       ,  1.5       ,  1.14504505],
       [ 5.        ,  3.07232143,  1.4       ,  1.14504505]])

In [ ]: