In [1]:
import sklearn

In [2]:
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDRegressor

import pandas as pd
import numpy as np

In [3]:
X, y = make_regression()
pdf = pd.DataFrame(X)
# Name the columns c0..c99 so they can be selected by name below.
pdf.columns = ['c' + str(i) for i in range(100)]

In [4]:
# Sanity check: make_regression() default is 100 samples x 100 features.
X.shape


Out[4]:
(100, 100)

In [5]:
# Split the frame by name: X1 holds the last 50 columns, X2 the first 50.
X1 = pdf[['c' + str(i) for i in range(50, 100)]]
X2 = pdf[['c' + str(i) for i in range(50)]]

In [10]:
"""
Implement DPP version that is similar to what is done above


sketch of solution
------------------

DPP requires a known number of parameters to check at each partial fit!


"""

class DPPRegressor(SGDRegressor):
    """SGD regressor whose coefficient vector grows as new feature
    columns appear across successive ``partial_fit`` calls.

    ``filter_cols`` holds column indices to drop before fitting or
    predicting; it starts empty (keep everything).
    """

    def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001,
                 l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None,
                 shuffle=True, verbose=0, epsilon=0.1,
                 random_state=None, learning_rate="invscaling", eta0=0.01,
                 power_t=0.25, warm_start=False, average=False, n_iter=None):
        # All constructor parameters are forwarded unchanged to SGDRegressor.
        super(DPPRegressor, self).__init__(loss=loss, penalty=penalty,
                                           alpha=alpha, l1_ratio=l1_ratio,
                                           fit_intercept=fit_intercept,
                                           max_iter=max_iter, tol=tol,
                                           shuffle=shuffle,
                                           verbose=verbose,
                                           epsilon=epsilon,
                                           random_state=random_state,
                                           learning_rate=learning_rate,
                                           eta0=eta0, power_t=power_t,
                                           warm_start=warm_start,
                                           average=average, n_iter=n_iter)
        # Column indices to drop in `_fit_columns`; empty list keeps all.
        self.filter_cols = []
        # Coefficient count recorded at the start of the last partial fit.
        self.base_shape = None

    def _fit_columns(self, X, return_x=True):
        """Filter out the "unselected" columns listed in ``self.filter_cols``.

        Parameters
        ----------
        X : pandas.DataFrame or ndarray of shape (n_samples, n_features)
            Input data; columns are addressed by positional index.
        return_x : bool
            If False, return only the boolean keep-mask instead of the
            filtered data.

        Returns
        -------
        The filtered ``X`` (same container type as the input) when
        ``return_x`` is True, otherwise a boolean mask of length
        ``n_features`` (True = keep).
        """
        # `np.bool` was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin `bool` is the supported dtype.
        bool_mask = np.ones((X.shape[1],), dtype=bool)
        if len(self.filter_cols) == 0:
            # Nothing to filter: return the data (or an all-True mask).
            return X if return_x else bool_mask
        bool_mask[self.filter_cols] = False
        if not return_x:
            return bool_mask
        if isinstance(X, pd.DataFrame):
            return X[X.columns[bool_mask]]
        return X[:, bool_mask]

    def _partial_dpp_fit(self, X, y):
        """Grow ``self.coef_`` with zero-initialised entries so it matches
        the (filtered) width of ``X``; existing weights are preserved.

        NOTE(review): the conditional DPP sampling described in the cell
        note is not implemented yet — this only pads the coefficients.
        """
        self.base_shape = self.coef_.shape[0]
        # Drop any filtered columns before measuring the new width.
        X = self._fit_columns(X)
        n_samples, n_features = X.shape
        # New columns start at weight zero; previously learned weights
        # occupy the leading slots.
        coef_list = np.zeros(n_features, dtype=np.float64, order="C")
        coef_list[:self.coef_.shape[0]] = self.coef_.copy()
        self.coef_ = coef_list

    def partial_fit(self, X, y, sample_weight=None):
        """Incrementally fit, first resizing ``coef_`` to ``X``'s width."""
        self._partial_dpp_fit(X, y)
        # Bug fix: forward the caller's sample_weight instead of
        # hard-coding None.
        super(DPPRegressor, self).partial_fit(X, y,
                                              sample_weight=sample_weight)
        return self

    def predict(self, X):
        """Predict after removing filtered columns from ``X``."""
        X = self._fit_columns(X)
        return super(DPPRegressor, self).predict(X)

In [11]:
# Fit on the 50-column slice X1; coef_ will have 50 entries (see Out[12]).
model = DPPRegressor(max_iter=1000)
model.fit(X1, y)


Out[11]:
DPPRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=1000, n_iter=None, penalty='l2',
       power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)

In [12]:
# Trained on X1 (50 columns) only, so 50 coefficients.
len(model.coef_)


Out[12]:
50

In [13]:
# Partial-fit on the full 100-column frame; coef_ is padded to width 100.
model.partial_fit(pdf, y)


Out[13]:
DPPRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=1000, n_iter=None, penalty='l2',
       power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)

In [14]:
# After partial_fit on all 100 columns the coefficient vector grew to 100.
len(model.coef_)


Out[14]:
100

In [15]:
# Predict on all 100 columns (filter_cols is empty, so nothing is dropped).
model.predict(pdf)


Out[15]:
array([-215.68090643, -171.12348582,  -86.5133144 , -206.42923396,
        199.36330671, -277.10036819, -124.91231933,  -51.79150044,
         19.01488922,  268.10757417, -256.71295929,  -29.47206108,
        135.80009473,  135.36187193,   68.66074387,   82.57113691,
        148.19926614, -114.57455919,    2.72501381,   39.31876302,
        196.08499757,  -63.31608078, -288.63349941,    2.13140069,
         37.0888929 ,  141.24296281,   37.21294871, -255.98162184,
        263.43541023, -105.15605908, -174.81860156,  -10.06764571,
        -76.1533631 ,   28.53949232, -158.57791312, -200.7821329 ,
        197.80912931, -411.57004559, -338.36618913, -134.84274097,
       -113.74300531, -195.82821768,  -54.82063278,   64.52205131,
        195.45766935,  134.57796755,   13.93516877,  150.03431086,
         76.89292196, -257.34039213,   -3.20981957, -110.36049994,
        -48.33271121,  115.27725093,  -79.3158718 , -133.53034614,
          4.51964811,  139.22359803,  116.41288154,  -13.85101037,
        -18.38577379,   50.60562216, -149.08917181,  103.43543962,
         21.8560818 , -151.62957747,  195.39606089,  234.93797178,
        200.57531108,  -84.98114743,  152.30626309,  180.7860101 ,
       -114.05672781,   10.54757379,  159.46798155, -217.97440215,
       -203.92419957,   60.00905389, -178.47094565,  172.00093528,
       -121.88258162,  -92.79638219, -124.0584415 ,  -20.23036222,
         62.33783162, -225.38100009, -258.01107118, -294.59753333,
         66.97186478, -239.01942258, -141.77808458,   64.21837455,
         74.66078434,   28.74899043, -326.32486668,   70.56650819,
         -2.86268203,  -74.4647368 ,   36.69288794,  144.96090556])

In [ ]: