In [1]:
# We will use Pipelines to chain several of our data transformation
# steps together.
In [3]:
from sklearn.datasets import load_iris
import numpy as np
In [4]:
iris = load_iris()
In [5]:
# note: this is a reference to iris.data, not a copy, so the array
# is modified in place below
iris_data = iris.data
In [6]:
# randomly flag about 25% of the entries to knock out
mask = np.random.binomial(1, .25, iris_data.shape).astype(bool)
In [7]:
# set the flagged entries to NaN to simulate missing data
iris_data[mask] = np.nan
In [8]:
iris_data[:5]
Out[8]:
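In [ ]:
# Sanity check (an added sketch, not in the original notebook):
# roughly a quarter of the entries should now be missing.
np.isnan(iris_data).sum(), iris_data.size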
In [10]:
# The goal here is to first impute the missing values in the data and
# then perform PCA on the corrected dataset. We would also need to
# split the data into a training set and a holdout set (a quick sketch
# follows), but we will do that properly later.
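In [ ]:
# A minimal sketch of that split (train_test_split and the 25% holdout
# size are assumptions, not from the original notebook); the cells
# below keep working with the full array.
from sklearn.model_selection import train_test_split
train, holdout = train_test_split(iris_data, test_size=.25, random_state=0)
train.shape, holdout.shape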
In [11]:
from sklearn import pipeline, preprocessing, decomposition
In [12]:
pca = decomposition.PCA()
# Imputer fills in missing values, using the mean of each column by
# default (in scikit-learn >= 0.22 it has been removed in favor of
# sklearn.impute.SimpleImputer)
imputer = preprocessing.Imputer()
In [14]:
pipe = pipeline.Pipeline([('imputer', imputer), ('pca', pca)])
iris_data_transformed = pipe.fit_transform(iris_data)
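In [ ]:
# For comparison (an added sketch, not in the original notebook): the
# pipeline is equivalent to running the two steps by hand in order.
manual = pca.fit_transform(imputer.fit_transform(iris_data))
np.allclose(manual, iris_data_transformed)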
In [15]:
iris_data_transformed[:5]
Out[15]:
In [16]:
# a quicker way to make a pipeline; make_pipeline names each step
# automatically after its lowercased class name ('imputer', 'pca')
pipe2 = pipeline.make_pipeline(imputer, pca)
pipe2.steps
Out[16]:
In [17]:
iris_data_transformed2 = pipe2.fit_transform(iris_data)
In [18]:
iris_data_transformed2[:5]
Out[18]:
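In [ ]:
# An added sketch (not in the original notebook): the fitted steps are
# also reachable through named_steps, e.g. the variance explained by
# each principal component.
pipe2.named_steps['pca'].explained_variance_ratio_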
In [19]:
# we can set a step's parameters through the pipeline with:
# pipe2.set_params(<stepName>__<paramName>=value)
# (note the double underscore between the step name and the parameter)
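In [ ]:
# An added sketch of set_params in action (not in the original
# notebook): keep only two principal components by addressing the
# 'pca' step, then re-fit.
pipe2.set_params(pca__n_components=2)
pipe2.fit_transform(iris_data).shape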