This implementation accepts only a pandas DataFrame and treats the column names to keep purely as string identifiers.

Reasoning for only taking Pandas

Streams of data often arrive in chunks (key-value pairs or otherwise) with no intrinsic ordering, so assuming a fixed column order in the model matrix is probably inappropriate.

Dealing with numpy matrices (to do in the future)

Presume an order that is to be enforced — the easiest way is to:

  • Check if object is array, if it is:
    • Force column names in the logical way (e.g. column0, column1 ... etc)
    • Coerce as a dataframe
  • Proceed as normal

In [1]:
import sklearn

In [2]:
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDRegressor

import pandas as pd
import numpy as np

In [3]:
X, y = make_regression()
pdf = pd.DataFrame(X)
pdf.columns = ['c{}'.format(x) for x in range(100)]

In [4]:
X.shape


Out[4]:
(100, 100)

In [5]:
X1 = pdf[['c{}'.format(x) for x in range(50, 100)]]
X2 = pdf[['c{}'.format(x) for x in range(50)]]

In [6]:
class GraftingRegressor(SGDRegressor):
    """SGD regressor supporting "grafting": the retained feature set
    grows incrementally as new columns appear across ``fit`` /
    ``partial_fit`` calls.

    Accepts pandas DataFrames only; column names are the feature
    identifiers. After each fit, coefficients whose magnitude falls
    below ``reg_penalty`` are pruned and their columns are excluded
    from subsequent fits.

    Parameters mirror ``SGDRegressor`` with one addition:

    reg_penalty : float, optional
        Magnitude threshold below which fitted coefficients are
        discarded. Defaults to ``l1_ratio`` when not provided.
    """

    def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001,
                 l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None,
                 shuffle=True, verbose=0, epsilon=0.1,
                 random_state=None, learning_rate="invscaling", eta0=0.01,
                 power_t=0.25, warm_start=False, average=False, n_iter=None, reg_penalty=None):
        super(GraftingRegressor, self).__init__(loss=loss, penalty=penalty,
                                           alpha=alpha, l1_ratio=l1_ratio,
                                           fit_intercept=fit_intercept,
                                           max_iter=max_iter, tol=tol,
                                           shuffle=shuffle,
                                           verbose=verbose,
                                           epsilon=epsilon,
                                           random_state=random_state,
                                           learning_rate=learning_rate,
                                           eta0=eta0, power_t=power_t,
                                           warm_start=warm_start,
                                           average=average, n_iter=n_iter)
        # coef_info tracks the grafting state:
        #   'cols'          -- column names currently retained
        #   'coef'          -- coefficients aligned positionally with 'cols'
        #   'excluded_cols' -- seen columns pruned out (or manually excluded)
        self.coef_info = {'cols': [], 'coef': [], 'excluded_cols': []}
        self.seen_cols = []   # every column name observed across all fits
        self.base_shape = None
        # Pruning threshold; fall back to l1_ratio when not given.
        self.reg_penalty = reg_penalty if reg_penalty is not None else l1_ratio

    def add_column_exclusion(self, cols):
        """Manually mark ``cols`` (iterable of names) as excluded from fitting."""
        self.coef_info['excluded_cols'] = list(self.coef_info['excluded_cols']) + list(cols)

    def _fit_columns(self, X_, return_x=True, transform_only=False):
        """Filter and order the columns of ``X_`` for fitting or prediction.

        Drops excluded columns and orders the remainder so previously
        retained columns come first — their coefficients align with
        ``self.coef_`` by position.

        Parameters
        ----------
        X_ : pandas.DataFrame
        return_x : bool
            When False, return a boolean mask over ``X_.columns``
            marking the selected columns instead of the DataFrame.
        transform_only : bool
            When True (the prediction path), select exactly the
            retained columns so the result always matches the fitted
            coefficient vector, even if ``X_`` carries unseen columns.
        """
        if transform_only:
            # Prediction: only columns with a fitted coefficient, in
            # the coefficient order.
            col_order = list(self.coef_info['cols'])
        else:
            X = X_[X_.columns.difference(self.coef_info['excluded_cols'])]
            # Retained columns first, then any newly seen columns.
            col_order = self.coef_info['cols'] + [x for x in X.columns
                                                 if x not in self.coef_info['cols']]
        if not return_x:
            keep = set(col_order)
            return np.array([c in keep for c in X_.columns])
        return X_[col_order]

    def _reg_penalty(self, X):
        """Prune coefficients with ``|coef| < reg_penalty``.

        Updates ``coef_info`` (retained cols/coefs, excluded cols) and
        shrinks ``self.coef_`` to the retained coefficients.
        """
        col_coef = [(col, coef) for col, coef in zip(X.columns.tolist(), self.coef_)
                    if np.abs(coef) >= self.reg_penalty]
        self.coef_info['cols'] = [c for c, _ in col_coef]
        self.coef_info['coef'] = [w for _, w in col_coef]
        # Anything ever seen but not retained is excluded going forward.
        self.coef_info['excluded_cols'] = [c for c in self.seen_cols
                                           if c not in self.coef_info['cols']]
        self.coef_ = np.array(self.coef_info['coef'])

    def _partial_grafting_fit(self, X, y):
        """Expand ``self.coef_`` to cover new columns in ``X``.

        Previously retained coefficients keep their values (they come
        first in the column order produced by ``_fit_columns``); new
        columns start at zero.
        """
        n_samples, n_features = X.shape
        coef_list = np.zeros(n_features, dtype=np.float64, order="C")
        coef_list[:len(self.coef_info['coef'])] = self.coef_info['coef']
        self.coef_ = coef_list.copy()

    def fit(self, X, y, coef_init=None, intercept_init=None,
            sample_weight=None):
        """Fit on DataFrame ``X``, then prune weak coefficients."""
        self.seen_cols = list(set(self.seen_cols + X.columns.tolist()))
        super(GraftingRegressor, self).fit(X, y, coef_init=coef_init,
                                           intercept_init=intercept_init,
                                           sample_weight=sample_weight)
        self._reg_penalty(X)
        return self

    def partial_fit(self, X, y, sample_weight=None):
        """Incrementally fit on ``X``, grafting in any new columns."""
        self.seen_cols = list(set(self.seen_cols + X.columns.tolist()))
        X = self._fit_columns(X)
        # Pad the coefficient vector with zeros for newly seen columns.
        self._partial_grafting_fit(X, y)
        super(GraftingRegressor, self).partial_fit(X, y, sample_weight=sample_weight)

        # Update parameters based on weight of regularizer penalty.
        self._reg_penalty(X)
        return self

    def predict(self, X):
        """Predict using only the retained columns of DataFrame ``X``."""
        X = self._fit_columns(X, transform_only=True)
        return super(GraftingRegressor, self).predict(X)

In [7]:
model = GraftingRegressor(max_iter=1000, l1_ratio=1.0)
model.fit(X1, y)


Out[7]:
GraftingRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
         fit_intercept=True, l1_ratio=1.0, learning_rate='invscaling',
         loss='squared_loss', max_iter=1000, n_iter=None, penalty='l2',
         power_t=0.25, random_state=None, reg_penalty=1.0, shuffle=True,
         tol=None, verbose=0, warm_start=False)

In [8]:
len(model.coef_)


Out[8]:
49

In [9]:
model.partial_fit(pdf, y)


Out[9]:
GraftingRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
         fit_intercept=True, l1_ratio=1.0, learning_rate='invscaling',
         loss='squared_loss', max_iter=1000, n_iter=None, penalty='l2',
         power_t=0.25, random_state=None, reg_penalty=1.0, shuffle=True,
         tol=None, verbose=0, warm_start=False)

In [10]:
len(model.coef_)


Out[10]:
57

In [11]:
model.coef_info.keys()


Out[11]:
dict_keys(['coef', 'cols', 'excluded_cols'])

In [12]:
len(model.coef_info['cols'])


Out[12]:
57

In [13]:
len(model.coef_info['coef'])


Out[13]:
57

In [14]:
len(model.coef_info['excluded_cols'])


Out[14]:
43

In [15]:
model.predict(pdf)


Out[15]:
array([-108.35791191,  207.91816625,  -31.47114295,  -51.47720674,
         46.87862919,  -35.5718505 ,  142.98270672,  133.55590398,
         -5.96245542,   75.94270026,  237.41777405,   28.68783597,
         65.38049609,  -78.37920503, -175.57978513,  -26.95706324,
        224.70945344, -245.22455315,   40.53589353,   -4.40814661,
         33.92163182, -131.88991859,  -98.27594562,    4.88781561,
       -239.76452488,  -13.33517415,  128.02397954,   34.39949925,
        -58.46164746,   21.09281941, -157.10914664,  185.2639206 ,
         46.94566028,  140.82046925,  -90.93960077,   84.62717298,
        -89.30355266, -233.15676383,  385.26502104,  -33.90056035,
       -255.85025649,   27.73267347,  -63.66160932, -305.26282284,
       -152.26960375,  116.83927432,  -46.2483672 , -165.38233289,
       -122.66969556, -358.08372963,  -40.14668753,  -92.9073549 ,
        209.26726795, -129.18168025,   80.805593  ,   -4.60250911,
       -145.61112123,   41.5809896 , -211.83619557,  -55.21152591,
         87.76505327, -103.95192244, -127.21578644,    8.63064065,
       -115.46305036, -205.08113286,   68.10018359,   27.01942254,
        156.00862167,   -9.74603082,   10.93598693, -270.55823989,
        -94.29402979, -196.46017663, -230.7591019 ,  -30.01558006,
         59.32705732,  -59.30663026,  156.83272818,  -73.86015553,
        155.70577987,  123.44576436, -172.08924536,  -58.46398174,
        137.19822603, -122.51896394, -114.12730308,   26.66347553,
        255.79238515, -177.5852836 ,   65.61203443,  -40.98944457,
        -21.82784615,  328.76406259, -342.33621918,   80.14761653,
        -90.0081762 , -247.0117391 , -281.31110234,    9.94641617])