notebook.community

Edit and run



In [1]:

    
# We will use Pipelines to put some of our data transformation
# processes together.



In [3]:

    
from sklearn.datasets import load_iris
import numpy as np



In [4]:

    
# Load the iris dataset object; its feature matrix is read from `.data` below.
iris = load_iris()



In [5]:

    
# Work on a copy of the feature matrix so that injecting NaNs further down
# does not overwrite the original measurements held in `iris.data`.
iris_data = iris.data.copy()



In [6]:

    
# Draw a boolean mask with the same shape as the data in which each entry is
# True with probability 0.25; True marks a value that will be blanked out.
# A dedicated, seeded RandomState keeps the mask (and everything computed
# from it) identical across Restart Kernel -> Run All.
rng = np.random.RandomState(0)
mask = rng.binomial(1, .25, iris_data.shape).astype(bool)



In [7]:

    
# Simulate missing data: overwrite every masked entry with NaN.
# This assigns in place, so re-running it after drawing a new mask adds
# further NaNs on top of the ones already written.
iris_data[mask] = np.nan



In [8]:

    
# Preview the first five rows to confirm that NaNs were injected.
iris_data[:5]









    Out[8]:





array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  nan,  0.2],
       [ 4.6,  3.1,  nan,  0.2],
       [ nan,  nan,  1.4,  0.2]])



In [10]:

    
# the goal here is to first impute the missing values of the data
# then perform PCA on the corrected dataset.
# We will need to split this into a training set and holdout set
# but we will do that later.



In [11]:

    
from sklearn import pipeline, preprocessing, decomposition



In [12]:

    
# Build the two pipeline stages: mean-imputation of NaNs, then PCA.
pca = decomposition.PCA()
try:
    # scikit-learn >= 0.20 provides the imputer in sklearn.impute;
    # preprocessing.Imputer was deprecated in 0.20 and removed in 0.22.
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer()  # defaults: missing_values=NaN, strategy='mean'
except ImportError:
    # Older scikit-learn releases only ship preprocessing.Imputer.
    imputer = preprocessing.Imputer()



In [14]:

    
# Chain the imputer and PCA under explicit step names, then fit the whole
# chain on the data containing NaNs and transform it in a single call.
pipe = pipeline.Pipeline(steps=[
    ('imputer', imputer),
    ('pca', pca),
])
iris_data_transformed = pipe.fit_transform(iris_data)



In [15]:

    
# Preview the first five rows of the imputed, PCA-transformed data.
iris_data_transformed[:5]









    Out[15]:





array([[-2.7590798 , -0.11631647,  0.21088932,  0.1556825 ],
       [-2.78804295,  0.1800981 , -0.052513  , -0.20752371],
       [-0.69816283,  1.24650022,  0.49804474,  0.26204272],
       [-0.72334002,  1.3548672 ,  0.42701603,  0.21127236],
       [-2.47234033, -0.64217883,  0.2705157 , -0.52311571]])



In [16]:

    
# a quicker way to make a pipeline: make_pipeline names the steps itself
# (see the 'imputer' / 'pca' names in the output below).
# NOTE(review): this passes the same `imputer` and `pca` objects already used
# by `pipe`; presumably fitting pipe2 refits those shared objects -- confirm.
pipe2 = pipeline.make_pipeline(imputer, pca)
pipe2.steps









    Out[16]:





[('imputer',
  Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)),
 ('pca', PCA(copy=True, n_components=None, whiten=False))]



In [17]:

    
# Fit the auto-named pipeline on the same data so it can be compared with `pipe`.
iris_data_transformed2 = pipe2.fit_transform(iris_data)



In [18]:

    
# Preview the first five rows; they match the output of the explicitly named pipeline.
iris_data_transformed2[:5]









    Out[18]:





array([[-2.7590798 , -0.11631647,  0.21088932,  0.1556825 ],
       [-2.78804295,  0.1800981 , -0.052513  , -0.20752371],
       [-0.69816283,  1.24650022,  0.49804474,  0.26204272],
       [-0.72334002,  1.3548672 ,  0.42701603,  0.21127236],
       [-2.47234033, -0.64217883,  0.2705157 , -0.52311571]])



In [19]:

    
# we can set the parameters of a pipeline's steps with:
# pipe2.set_params(<stepName>__<paramName>=value)   (note the double underscore)
# e.g. pipe2.set_params(pca__n_components=2)



In [ ]: