In [1]:
import sklearn
import pandas as pd
import numpy as np
import scipy
In [2]:
# Show the installed version of each core library, in the order they were imported.
tuple(lib.__version__ for lib in (sklearn, pd, np, scipy))
Out[2]:
In [3]:
from sklearn.pipeline import Pipeline
class DataframeFunctionTransformer():
    """Pipeline step that delegates its transform to a user-supplied callable."""

    def __init__(self, func):
        # Callable invoked with the incoming object on every ``transform`` call.
        self.func = func

    def fit(self, X, y=None, **fit_params):
        """Learn nothing; return this instance unchanged."""
        return self

    def transform(self, input_df, **transform_params):
        """Return the result of calling ``func`` on ``input_df``.

        Extra keyword arguments are accepted but not forwarded to ``func``.
        """
        apply = self.func
        return apply(input_df)
In [4]:
def process_dataframe(input_df):
    """Return a new frame whose ``text`` column is upper-cased.

    The input frame is left untouched, so re-running a cell (or a pipeline)
    that calls this function never alters the data it started from.
    Missing values in ``text`` are passed through instead of raising.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing a ``text`` column of strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns; only ``text`` differs.

    Raises
    ------
    KeyError
        If ``input_df`` has no ``text`` column.
    """
    # ``assign`` builds a new frame rather than writing into the caller's one;
    # ``.str.upper()`` is the vectorised equivalent of mapping ``str.upper``
    # and leaves missing entries as missing.
    return input_df.assign(text=input_df["text"].str.upper())
In [5]:
# Demo frame: four integer ids paired with mixed-case strings.
df = pd.DataFrame(
    data=dict(
        id=[1, 2, 3, 4],
        text=["foo", "Bar", "BAz", "quux"],
    )
)
In [6]:
# Display the frame as built, before any pipeline is applied to it.
df
Out[6]:
In [7]:
# One-step pipeline wrapping ``process_dataframe``. That function upper-cases
# the "text" column, so the step is labelled "uppercase" to match what it does.
pipeline = Pipeline([
    ("uppercase", DataframeFunctionTransformer(process_dataframe))
])
In [8]:
# Fit the pipeline on ``df`` and apply it in a single call; the returned
# value is what the cell displays.
pipeline.fit_transform(df)
Out[8]:
In [9]:
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.pipeline import Pipeline
In [10]:
# Toy training set: eight samples with six features each, stored in CSR form.
data = scipy.sparse.csr_matrix(
    np.array(
        [
            [1., 0., 0., 0., 0., 0.],
            [0., 1., 0., 0., 0., 0.],
            [1., 0., 0., 0., 0., 0.],
            [0., 0., 0., 0., 1., 0.],
            [0., 0., 0., 1., 0., 0.],
            [1., 0., 0., 0., 0., 0.],
            [1., 1., 0., 0., 0., 0.],
            [1., 1., 0., 0., 0., 0.],
        ]
    )
)
# One binary label per row of ``data``.
target = np.asarray([1, 1, 1, 0, 0, 0, 1, 1])
In [11]:
class ToDenseTransformer():
    """Pipeline step that turns a sparse matrix into a dense NumPy array.

    The result is a plain ``numpy.ndarray``. ``todense()`` would return a
    ``numpy.matrix`` instead, a type NumPy discourages and that recent
    scikit-learn input validation refuses to accept.
    """

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        Parameters
        ----------
        X : scipy sparse matrix or array-like
            Anything exposing ``toarray()`` is densified through it; other
            input is passed through ``numpy.asarray`` so already-dense data
            does not raise.
        y : ignored
        **fit_params : ignored

        Returns
        -------
        numpy.ndarray
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return this instance."""
        return self
# The matrix is made dense first because PCA does not work with sparse vectors.
pipeline = Pipeline(steps=[
    ('to_dense', ToDenseTransformer()),
    ('pca', PCA()),
    ('clf', DecisionTreeClassifier()),
])
# ``fit`` returns the pipeline itself, so training and predicting are chained;
# the predictions on the training data are the cell output.
pipeline.fit(data, target).predict(data)
Out[11]:
In [12]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
In [13]:
# Five named people; the third age is NaN so there is a gap to impute.
df = pd.DataFrame(
    data=dict(
        name=['alice', 'bob', 'charlie', 'david', 'edward'],
        age=[24, 32, np.nan, 38, 20],
    )
)
df.head()
Out[13]:
In [14]:
# Mean-impute the 'age' column; the remaining columns are passed through as-is.
transformer_step = ColumnTransformer(
    transformers=[
        ('impute_mean', SimpleImputer(strategy='mean'), ['age']),
    ],
    remainder='passthrough',
)
In [15]:
# Single-step pipeline wrapping the column transformer.
pipe = Pipeline(steps=[('transformer', transformer_step)])
In [16]:
# Fit on the frame, then transform it. Column labels are supplied by hand in
# the transformer's output order ('age' first, then the passed-through 'name')
# before the original column order is restored for display.
pipe.fit(df)
pd.DataFrame(
    pipe.transform(df),
    columns=['age', 'name'],
).loc[:, ["name", "age"]]
Out[16]:
In [17]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
In [18]:
# Five rows: one categorical feature, one numeric feature with a missing
# value, and a binary target column.
df = pd.DataFrame(
    data=dict(
        favorite_color=['blue', 'green', 'red', 'green', 'blue'],
        age=[10, 15, 10, np.nan, 10],
        target=[1, 0, 1, 0, 1],
    )
)
In [19]:
# Display the raw frame, including the row whose 'age' is missing.
df
Out[19]:
In [20]:
# One small preprocessing pipeline per kind of column.
categorical_preprocessing = Pipeline(steps=[('ohe', OneHotEncoder())])
numerical_preprocessing = Pipeline(steps=[('imputation', SimpleImputer())])

# Route each column to the preprocessing pipeline meant for it.
preprocess = ColumnTransformer(
    transformers=[
        ('categorical_preprocessing', categorical_preprocessing, ['favorite_color']),
        ('numerical_preprocessing', numerical_preprocessing, ['age']),
    ]
)

# Full model: the preprocessing stage followed by the classifier.
pipeline = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('clf', DecisionTreeClassifier()),
    ]
)

# Split the frame into target and features, then fit the whole pipeline on them.
df_target = df['target']
df_features = df[['favorite_color', 'age']]
pipeline.fit(df_features, df_target)
Out[20]:
In [21]:
import pandas as pd
from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.pipeline import Pipeline
In [22]:
class SelectColumnsTransfomer():
    """Pipeline step that keeps only the requested columns of a DataFrame.

    Parameters
    ----------
    columns : list of column labels, or None
        Columns to keep. ``None`` (the default) keeps every column; without
        that guard the default would index the frame with ``None`` and raise
        ``KeyError``.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def transform(self, X, **transform_params):
        """Return a copy of ``X`` restricted to ``self.columns``.

        The result is always a copy, so later edits to it do not write back
        into ``X``. Extra keyword arguments are accepted and ignored.

        Raises
        ------
        KeyError
            If a requested column is not present in ``X``.
        """
        if self.columns is None:
            return X.copy()
        return X[self.columns].copy()

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return this instance."""
        return self
In [23]:
# Five named people with their ages; the third age is NaN.
df = pd.DataFrame(
    data=dict(
        name=['alice', 'bob', 'charlie', 'david', 'edward'],
        age=[24, 32, np.nan, 38, 20],
    )
)
df.head()
Out[23]:
In [24]:
# Single-step pipeline that keeps only the "name" column.
pipe = Pipeline(
    steps=[
        ('selector', SelectColumnsTransfomer(columns=["name"])),
    ]
)
In [25]:
# Fit the selector pipeline on ``df`` and apply it in a single call; the
# returned value is what the cell displays.
pipe.fit_transform(df)
Out[25]:
In [ ]: