In [1]:
import sklearn
In [2]:
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDRegressor
import pandas as pd
import numpy as np
In [3]:
# Generate a synthetic regression problem and wrap it in a labelled DataFrame.
X, y = make_regression()
pdf = pd.DataFrame(X)
# Name columns c0..c{n-1}. Derive the count from X.shape[1] instead of
# hard-coding 100, so this keeps working if make_regression's n_features
# argument/default ever differs.
pdf.columns = ['c{}'.format(x) for x in range(X.shape[1])]
In [4]:
X.shape
Out[4]:
In [5]:
# Partition the feature frame into two disjoint halves by column position:
# X1 gets the upper half (c50..c99), X2 the lower half (c0..c49).
X1 = pdf[pdf.columns[50:100]]
X2 = pdf[pdf.columns[:50]]
In [103]:
class GraftingRegressor(SGDRegressor):
    """SGDRegressor variant that supports "grafting" of new features.

    After an initial ``fit`` on a subset of columns, ``partial_fit`` may be
    called with a wider matrix whose extra columns are appended on the
    right.  Zero-initialised coefficients are grown for the new columns;
    after the SGD update, any new coefficient whose magnitude falls below
    ``reg_penalty`` is discarded and its column index is remembered in
    ``filter_cols`` so the column is ignored on all later calls.

    All constructor parameters mirror ``SGDRegressor``.  ``reg_penalty``
    is the magnitude threshold used to prune new coefficients; it defaults
    to ``l1_ratio`` when not given.
    """

    def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001,
                 l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None,
                 shuffle=True, verbose=0, epsilon=0.1,
                 random_state=None, learning_rate="invscaling", eta0=0.01,
                 power_t=0.25, warm_start=False, average=False, n_iter=None, reg_penalty=None):
        super(GraftingRegressor, self).__init__(loss=loss, penalty=penalty,
                                                alpha=alpha, l1_ratio=l1_ratio,
                                                fit_intercept=fit_intercept,
                                                max_iter=max_iter, tol=tol,
                                                shuffle=shuffle,
                                                verbose=verbose,
                                                epsilon=epsilon,
                                                random_state=random_state,
                                                learning_rate=learning_rate,
                                                eta0=eta0, power_t=power_t,
                                                warm_start=warm_start,
                                                average=average, n_iter=n_iter)
        # Flat list of integer column indices that have been pruned.
        self.filter_cols = []
        # Coefficient count before the most recent grafting step.
        self.base_shape = None
        self.reg_penalty = reg_penalty if reg_penalty is not None else l1_ratio

    def _fit_columns(self, X, return_x=True):
        """Drop the columns listed in ``self.filter_cols`` from ``X``.

        If ``return_x`` is False, return only the boolean keep-mask over
        the columns of ``X`` instead of the filtered matrix.
        """
        import pandas
        # BUG FIX: np.bool was removed in NumPy 1.24 -- use the builtin bool.
        bool_mask = np.ones((X.shape[1],), dtype=bool)
        if not self.filter_cols:
            return X if return_x else bool_mask
        bool_mask[self.filter_cols] = False
        if not return_x:
            return bool_mask
        # isinstance rather than an exact type check, so DataFrame
        # subclasses are handled too.
        if isinstance(X, pandas.DataFrame):
            return X[X.columns[bool_mask]]
        return X[:, bool_mask]

    def _reg_penalty(self, tot_new_feats, base_size):
        """Prune newly-grafted coefficients below ``self.reg_penalty`` and
        record their (unfiltered) column offsets in ``self.filter_cols``."""
        if tot_new_feats <= 0:
            # Nothing was grafted; avoid the degenerate [-0:] slice which
            # would treat *every* coefficient as new.
            return
        new_coefs = self.coef_[-tot_new_feats:]
        remove_cols = np.argwhere(np.abs(new_coefs) < self.reg_penalty).flatten()
        add_cols = np.argwhere(np.abs(new_coefs) >= self.reg_penalty).flatten()
        base_coef = self.coef_[:-tot_new_feats].tolist()
        # Keep only the surviving new coefficients.
        base_coef = base_coef + new_coefs[add_cols].flatten().tolist()
        self.coef_ = np.array(base_coef)
        # BUG FIX: the original *appended* a list of argwhere arrays,
        # producing a nested structure that is invalid as a fancy index in
        # _fit_columns and inflates the len(filter_cols) bookkeeping in
        # partial_fit.  Extend with plain ints instead.
        self.filter_cols.extend(int(base_size + c) for c in remove_cols)

    def _partial_grafting_fit(self, X_, y):
        """Grow ``self.coef_`` with zero-initialised slots for the new
        columns of ``X_`` (after removing already-pruned columns)."""
        # Remember how many coefficients existed before grafting, so
        # irrelevant columns can be checked for in the future.
        self.base_shape = self.coef_.shape[0]
        X = self._fit_columns(X_)
        n_samples, n_features = X.shape
        coef_list = np.zeros(n_features, dtype=np.float64, order="C")
        coef_list[:self.coef_.shape[0]] = self.coef_.copy()
        self.coef_ = coef_list.copy()

    def partial_fit(self, X, y, sample_weight=None):
        """Incrementally fit, grafting in any columns of ``X`` that were
        not present in earlier fits.

        ``X`` must contain the previously-seen columns first (including
        pruned ones) with brand-new columns appended on the right.
        Returns ``self``.
        """
        # Total columns accounted for so far = kept coefs + pruned columns.
        base_size = len(self.filter_cols) + self.coef_.shape[0]
        tot_new_feats = X.shape[1] - base_size
        self._partial_grafting_fit(X, y)
        # BUG FIX: the SGD update must see the *filtered* matrix (pruned
        # columns removed) so its width matches self.coef_; the original
        # passed the raw X.
        X_filtered = self._fit_columns(X)
        # BUG FIX: forward the caller's sample_weight instead of
        # hard-coding None.
        super(GraftingRegressor, self).partial_fit(X_filtered, y,
                                                   sample_weight=sample_weight)
        # Prune weak new coefficients based on the regularizer penalty.
        self._reg_penalty(tot_new_feats, base_size)
        return self

    def predict(self, X):
        """Predict after removing pruned columns from ``X``."""
        X = self._fit_columns(X)
        return super(GraftingRegressor, self).predict(X)
In [104]:
# Scratch check of the fancy-index + list-concatenation pattern that
# _reg_penalty uses to splice surviving coefficients together
# (requires `model` from the cell below to have been run already).
model.coef_[-5:][[0,3]].tolist() + model.coef_[-5:][[0,3]].tolist()
Out[104]:
In [105]:
# Initial fit on the upper half of the columns only (X1 = c50..c99).
model = GraftingRegressor(max_iter=1000)
model.fit(X1, y)
Out[105]:
In [106]:
# Coefficient count after the initial fit on the 50-column X1.
len(model.coef_)
Out[106]:
In [107]:
# Graft: partial_fit on the full 100-column frame; the 50 unseen columns
# get new coefficients, weak ones are pruned into filter_cols.
model.partial_fit(pdf, y)
Out[107]:
In [108]:
# Coefficient count after grafting (grows by the surviving new columns).
len(model.coef_)
Out[108]:
In [109]:
# Predict on the full frame; pruned columns are dropped internally first.
model.predict(pdf)
Out[109]: