This notebook demonstrates a simple transformer mixin which removes duplicate features — columns whose values are identical.
In [117]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator
from sklearn.datasets import make_regression, make_classification
from sklearn.decomposition import PCA
# SelectorMixin is public in sklearn.feature_selection; the old
# sklearn.feature_selection.base path was removed in later releases.
from sklearn.feature_selection import SelectorMixin
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.utils.validation import check_is_fitted
In [180]:
class RepeatedRemover(BaseEstimator, SelectorMixin):
    """Transformer that drops duplicated feature columns.

    For every pair of identical columns the later (higher-index) column is
    dropped and the first occurrence is kept.  The scikit-learn selector
    interface is implemented via ``_get_support_mask``.
    """

    def fit(self, X, y=None):
        """Learn which columns duplicate an earlier column.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data; columns are compared pairwise for exact equality.
        y : ignored
            Present only for pipeline compatibility.

        Returns
        -------
        self
        """
        # NOTE: ``np.bool`` was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin ``bool`` is the correct dtype argument.
        self.indices_ = np.ones(X.shape[1], dtype=bool)
        dropped = []
        rows, cols = np.triu_indices(X.shape[1], 1)
        for i, j in zip(rows.tolist(), cols.tolist()):
            # Skip pairs involving a column already marked as a duplicate,
            # so only the first occurrence of each repeated column survives.
            if i in dropped or j in dropped:
                continue
            if np.array_equal(X[:, i], X[:, j]):
                dropped.append(j)
        self.indices_[dropped] = False
        return self

    def _get_support_mask(self):
        # Boolean mask of columns to keep; raises NotFittedError pre-fit.
        check_is_fitted(self, 'indices_')
        return self.indices_
In [181]:
class ColumnSelector(BaseEstimator, SelectorMixin):
    """Select (or exclude) DataFrame columns by name.

    Parameters
    ----------
    columns : str or list of str, default=[]
        Column name(s) to keep.  A bare string is treated as a
        single-element list.
    exclude : bool, default=False
        If True, keep every column *except* those listed in ``columns``.
    """

    def __init__(self, columns=[], exclude=False):
        self.columns = columns
        self.exclude = exclude

    def fit(self, X, y=None):
        """Compute the boolean support mask from ``X``'s column labels.

        Parameters
        ----------
        X : pandas.DataFrame of shape (n_samples, n_features)
            Must expose ``.columns``; names are matched against
            ``self.columns``.
        y : ignored
            Present only for pipeline compatibility.

        Returns
        -------
        self
        """
        # Normalise into a local list instead of mutating self.columns —
        # sklearn estimators should not alter constructor parameters in fit.
        columns = self.columns if isinstance(self.columns, list) else [self.columns]
        # np.bool was removed in NumPy 1.24; use the builtin bool dtype.
        self.indices_ = np.zeros(X.shape[1], dtype=bool)
        # Bug fix: the original referenced the *global* X_df here instead of
        # the X argument, so fitting on any other frame used the wrong data.
        indx = np.argwhere(X.columns.isin(columns)).flatten()
        self.indices_[indx] = True
        if self.exclude:
            self.indices_ = ~self.indices_
        return self

    def _get_support_mask(self):
        # Boolean mask of columns to keep; raises NotFittedError pre-fit.
        check_is_fitted(self, 'indices_')
        return self.indices_
In [182]:
# Build a 5x3 random matrix and stack it beside itself, so that every
# column appears exactly twice.
X = np.random.normal(size=(5, 3))
X_dup = np.hstack((X, X))

# The remover should drop the three repeated columns: expect shape (5, 3).
dedup = make_pipeline(RepeatedRemover())
dedup.fit_transform(X_dup).shape
Out[182]:
In [183]:
# Wrap X in a DataFrame with string column names: col0, col1, col2.
col_names = ["col{}".format(i) for i in range(X.shape[1])]
X_df = pd.DataFrame(X, columns=col_names)
In [184]:
# Keep only "col1": the transformed frame has a single column.
cs = ColumnSelector(['col1'])
cs.fit_transform(X_df).shape
Out[184]:
In [185]:
# Exclude "col1": the two remaining columns are kept instead.
cs = ColumnSelector(['col1'], exclude=True)
cs.fit_transform(X_df).shape
Out[185]:
We can force a column to be selected, regardless of the previous step, through `FeatureUnion`. Combining this with `RepeatedRemover` should ensure that the resulting dataset does not contain duplicate columns.
In [186]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2
In [187]:
# Load the iris dataset and split it into features and target.
iris = load_iris()
X = iris.data
y = iris.target
In [188]:
# Rebuild X_df with the iris feature names as column labels.
X_df = pd.DataFrame(X, columns=iris.feature_names)
In [189]:
# Univariate selection: score each feature with the chi-squared test
# against the target and keep the best two.
sk = SelectKBest(score_func=chi2, k=2)
sk.fit(X_df, y)
Out[189]:
In [190]:
# Boolean mask over the four features; True marks the selected ones.
sk.get_support()
Out[190]:
In [191]:
# Map the support mask back onto the column labels to see which
# feature names were selected.
X_df.columns[sk.get_support()]
Out[191]:
In [192]:
# Force `sepal width (cm)` and `petal length (cm)` to be selected alongside
# whatever SelectKBest picks, by combining both selectors in a FeatureUnion.
# (The original comment said "sepal length", but the code forces sepal width.)
union_steps = [
    ('sk', SelectKBest(chi2, k=2)),
    ('force_sel', ColumnSelector(['sepal width (cm)', 'petal length (cm)'])),
]
feat_sel = FeatureUnion(union_steps)
X_check = feat_sel.fit(X_df, y).transform(X_df)
In [193]:
# this object has two columns which are now the same...
# (one of the forced columns presumably overlaps with a SelectKBest choice,
#  so the union stacks the same column twice)
X_check
Out[193]:
In [194]:
# Append a RepeatedRemover after the FeatureUnion so the duplicated
# column produced in the previous step is dropped again.
pipeline_steps = [
    ('feature_sel', FeatureUnion([
        ('sk', SelectKBest(chi2, k=2)),
        ('force_sel', ColumnSelector(['sepal width (cm)', 'petal length (cm)'])),
    ])),
    ('remover', RepeatedRemover()),
]
feat_sel = Pipeline(pipeline_steps)
X_check = feat_sel.fit(X_df, y).transform(X_df)
X_check.shape # should be 3
Out[194]: