In [1]:
# We will use Pipelines to put some of our data transformation
# processes together.

In [3]:
from sklearn.datasets import load_iris
import numpy as np

In [4]:
iris = load_iris()

In [5]:
iris_data = iris.data

In [6]:
mask = np.random.binomial(1, .25, iris_data.shape).astype(bool)

In [7]:
iris_data[mask] = np.nan

In [8]:
iris_data[:5]


Out[8]:
array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  nan,  0.2],
       [ 4.6,  3.1,  nan,  0.2],
       [ nan,  nan,  1.4,  0.2]])

In [10]:
# the goal here is to first impute the missing values of the data
# then perform PCA on the corrected dataset.
# We will need to split this into a training set and holdout set
# but we will do that later.

In [11]:
from sklearn import pipeline, preprocessing, decomposition

In [12]:
pca = decomposition.PCA()
imputer = preprocessing.Imputer()

In [14]:
pipe = pipeline.Pipeline([('imputer', imputer), ('pca', pca)])
iris_data_transformed = pipe.fit_transform(iris_data)

In [15]:
iris_data_transformed[:5]


Out[15]:
array([[-2.7590798 , -0.11631647,  0.21088932,  0.1556825 ],
       [-2.78804295,  0.1800981 , -0.052513  , -0.20752371],
       [-0.69816283,  1.24650022,  0.49804474,  0.26204272],
       [-0.72334002,  1.3548672 ,  0.42701603,  0.21127236],
       [-2.47234033, -0.64217883,  0.2705157 , -0.52311571]])

In [16]:
# a quicker way to make a pipeline
pipe2 = pipeline.make_pipeline(imputer, pca)
pipe2.steps


Out[16]:
[('imputer',
  Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)),
 ('pca', PCA(copy=True, n_components=None, whiten=False))]

In [17]:
iris_data_transformed2 = pipe2.fit_transform(iris_data)

In [18]:
iris_data_transformed2[:5]


Out[18]:
array([[-2.7590798 , -0.11631647,  0.21088932,  0.1556825 ],
       [-2.78804295,  0.1800981 , -0.052513  , -0.20752371],
       [-0.69816283,  1.24650022,  0.49804474,  0.26204272],
       [-0.72334002,  1.3548672 ,  0.42701603,  0.21127236],
       [-2.47234033, -0.64217883,  0.2705157 , -0.52311571]])

In [19]:
# we can access the pipeline's attributes with:
# pipe2.set_params(<paramName>_<paramParam> = value)

In [ ]: