This implementation only takes in a pandas DataFrame and tracks the columns to keep as string names.

Reasoning for only taking pandas:
Streams of data often come in chunks (key-value pairs or otherwise) with no intrinsic ordering, so assuming a fixed column order in the model matrix is probably inappropriate.

Dealing with NumPy matrices (to do in the future):
Presume an order which is to be enforced; the easiest way is to name the columns positionally (column0, column1, ... etc.), as sketched below.
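
A minimal sketch of that future direction, assuming we simply wrap the NumPy array in a DataFrame with positional column names before handing it to the regressor (the to_named_frame helper is hypothetical and not part of the implementation below):

import numpy as np
import pandas as pd

def to_named_frame(X):
    # Hypothetical helper: enforce an order on a plain NumPy matrix by
    # giving each column a positional string name (column0, column1, ...).
    X = np.asarray(X)
    return pd.DataFrame(X, columns=['column{}'.format(i) for i in range(X.shape[1])])

# to_named_frame(np.random.randn(5, 3)).columns
# -> Index(['column0', 'column1', 'column2'], dtype='object')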
In [1]:
import sklearn
In [2]:
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDRegressor
import pandas as pd
import numpy as np
In [3]:
X, y = make_regression()
pdf = pd.DataFrame(X)
pdf.columns = ['c{}'.format(x) for x in range(100)]
In [4]:
X.shape
Out[4]:
(100, 100)
In [5]:
X1 = pdf[['c{}'.format(x) for x in range(50, 100)]]
X2 = pdf[['c{}'.format(x) for x in range(50)]]
In [6]:
class GraftingRegressor(SGDRegressor):
    def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001,
                 l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None,
                 shuffle=True, verbose=0, epsilon=0.1,
                 random_state=None, learning_rate="invscaling", eta0=0.01,
                 power_t=0.25, warm_start=False, average=False, n_iter=None,
                 reg_penalty=None):
        super(GraftingRegressor, self).__init__(loss=loss, penalty=penalty,
                                                alpha=alpha, l1_ratio=l1_ratio,
                                                fit_intercept=fit_intercept,
                                                max_iter=max_iter, tol=tol,
                                                shuffle=shuffle,
                                                verbose=verbose,
                                                epsilon=epsilon,
                                                random_state=random_state,
                                                learning_rate=learning_rate,
                                                eta0=eta0, power_t=power_t,
                                                warm_start=warm_start,
                                                average=average, n_iter=n_iter)
        # coef_info tracks which columns are currently kept, their coefficients,
        # and which previously seen columns have been pruned out.
        self.coef_info = {'cols': [], 'coef': [], 'excluded_cols': []}
        self.seen_cols = []
        self.base_shape = None
        # threshold below which a coefficient is treated as uninformative;
        # defaults to l1_ratio if not given explicitly
        self.reg_penalty = reg_penalty if reg_penalty is not None else l1_ratio

    def add_column_exclusion(self, cols):
        self.coef_info['excluded_cols'] = list(self.coef_info['excluded_cols']) + list(cols)

    def _fit_columns(self, X_, return_x=True, transform_only=False):
        """
        Filter out "unselected" (excluded) columns and reorder the rest so
        that previously kept columns come first, followed by any new columns.
        Selection is done by column name, not by position.
        If transform_only is True (e.g. at predict time), only the currently
        kept columns are returned, in the order of the fitted coefficients.
        (return_x is retained for API compatibility but is unused here.)
        """
        if transform_only:
            return X_[self.coef_info['cols']]
        X = X_[X_.columns.difference(self.coef_info['excluded_cols'])]
        # order the columns correctly: kept columns first, then new ones
        col_order = self.coef_info['cols'] + [x for x in X.columns if x not in self.coef_info['cols']]
        X = X[col_order]
        return X

    def _reg_penalty(self, X):
        # keep only the columns whose coefficient magnitude clears the threshold
        col_coef = [(col, coef) for col, coef in zip(X.columns.tolist(), self.coef_)
                    if np.abs(coef) >= self.reg_penalty]
        self.coef_info['cols'] = [x for x, _ in col_coef]
        self.coef_info['coef'] = [x for _, x in col_coef]
        self.coef_info['excluded_cols'] = [x for x in self.seen_cols if x not in self.coef_info['cols']]
        self.coef_ = np.array(self.coef_info['coef'])

    def _partial_grafting_fit(self, X, y):
        """
        Expand the coefficient vector to account for newly seen columns:
        previously kept coefficients are copied in, new columns start at zero.
        """
        n_samples, n_features = X.shape
        coef_list = np.zeros(n_features, dtype=np.float64, order="C")
        coef_list[:len(self.coef_info['coef'])] = self.coef_info['coef']
        self.coef_ = coef_list.copy()

    def fit(self, X, y, coef_init=None, intercept_init=None,
            sample_weight=None):
        self.seen_cols = list(set(self.seen_cols + X.columns.tolist()))
        super(GraftingRegressor, self).fit(X, y, coef_init=coef_init,
                                           intercept_init=intercept_init,
                                           sample_weight=sample_weight)
        self._reg_penalty(X)
        return self

    def partial_fit(self, X, y, sample_weight=None):
        self.seen_cols = list(set(self.seen_cols + X.columns.tolist()))
        X = self._fit_columns(X)
        self._partial_grafting_fit(X, y)
        super(GraftingRegressor, self).partial_fit(X, y, sample_weight=sample_weight)
        # update the kept columns based on the regularizer penalty threshold
        self._reg_penalty(X)
        return self

    def predict(self, X):
        X = self._fit_columns(X, transform_only=True)
        return super(GraftingRegressor, self).predict(X)
In [7]:
model = GraftingRegressor(max_iter=1000, l1_ratio=1.0)
model.fit(X1, y)
Out[7]:
In [8]:
len(model.coef_)
Out[8]:
In [9]:
model.partial_fit(pdf, y)
Out[9]:
In [10]:
len(model.coef_)
Out[10]:
In [11]:
model.coef_info.keys()
Out[11]:
In [12]:
len(model.coef_info['cols'])
Out[12]:
In [13]:
len(model.coef_info['coef'])
Out[13]:
In [14]:
len(model.coef_info['excluded_cols'])
Out[14]:
In [15]:
model.predict(pdf)
Out[15]:
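
As a closing sketch (illustrative only, reusing pdf, y and the GraftingRegressor defined above), chunks of a stream that progressively expose more columns can be fed through partial_fit in a loop. Note that each later chunk must still contain every previously kept column, since _fit_columns reindexes by those column names.

# Growing column windows over the same 100-column frame: each later chunk
# contains all previously seen columns plus new ones, which is what
# _fit_columns expects when it reorders by the kept column names.
chunks = [pdf[['c{}'.format(i) for i in range(50)]],
          pdf[['c{}'.format(i) for i in range(75)]],
          pdf]

clf = GraftingRegressor(max_iter=1000, l1_ratio=1.0)
clf.fit(chunks[0], y)                 # initial fit on the first chunk
for chunk in chunks[1:]:
    clf.partial_fit(chunk, y)         # graft in new columns, prune weak ones

len(clf.coef_info['cols']), len(clf.coef_info['excluded_cols'])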