In this implementation of alpha-investing we will use

mse and log loss as measures of likelihood for likelihood-ratio test


In [6]:
import sklearn

In [7]:
from sklearn.datasets import make_regression, make_classification
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.metrics import log_loss, mean_squared_error
from scipy.stats import chi2, norm

import pandas as pd
import numpy as np

In [8]:
X, y = make_regression(n_features=100)
pdf = pd.DataFrame(X)
pdf.columns = ['c{}'.format(x) for x in range(X.shape[1])]

In [9]:
X1 = pdf[['c{}'.format(x) for x in range(50, 100)]]
X2 = pdf[['c{}'.format(x) for x in range(50)]]

In [20]:
def get_p_val(X, col, y):
    # use log loss to calculate change in likelihood
    col = np.array(col)
    X = np.array(X)
    new_X = np.hstack([X, col.reshape(-1, 1)])
    mod = SGDRegressor(max_iter=1000)
    nul_y = mod.fit(X, y).predict(X)
    alt_y = mod.fit(new_X, y).predict(new_X)
    
    # calculate log loss
    nul_ll = mean_squared_error(y, nul_y)
    alt_ll = mean_squared_error(y, alt_y)
    return chi2.cdf(2*(nul_ll-alt_ll), 1)

In [21]:
import pandas

class AlphaInvestRegressor(SGDRegressor):
    def __init__(self, loss="squared_loss", penalty="l1", alpha=0.0001,
                 l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None,
                 shuffle=True, verbose=0, epsilon=0.1,
                 random_state=None, learning_rate="invscaling", eta0=0.01,
                 power_t=0.25, warm_start=False, average=False, n_iter=None, 
                 wealth=0.5, delta_alpha=0.5):
        super(AlphaInvestRegressor, self).__init__(loss=loss, penalty=penalty,
                                           alpha=alpha, l1_ratio=l1_ratio,
                                           fit_intercept=fit_intercept,
                                           max_iter=max_iter, tol=tol,
                                           shuffle=shuffle,
                                           verbose=verbose,
                                           epsilon=epsilon,
                                           random_state=random_state,
                                           learning_rate=learning_rate,
                                           eta0=eta0, power_t=power_t,
                                           warm_start=warm_start,
                                           average=average, n_iter=n_iter)
        """
        intragroup_alpha : the alpha level of t-test used to determine significance
        intergroup_thres : the threshold for lasso to remove redundancy
        """
        self.coef_info = {'cols': [], 'coef':[], 'excluded_cols': []}
        self.seen_cols = []
        self.base_shape = None
        self.delta_alpha = delta_alpha
        self.wealth = wealth
        self.temp_X = None
    
    def add_column_exclusion(self, cols):
        self.coef_info['excluded_cols'] = self.coef_info['excluded_cols'] + cols
    
    def _alpha_sel(self, X_, y):
        # need to continually refit...
        new_X = X_[X_.columns.difference(self.coef_info['cols'])]
        col_list = self.coef_info['cols'][:]
                
        for col in list(new_X.columns):
            # perform test with and without dataset...
            if col in self.coef_info['cols']:
                continue
            temp_X = X_[col_list]
            # evaluate this column
            pval = get_p_val(temp_X, new_X[[col]], y)
            alpha = self.wealth/(2*temp_X.shape[1])
            if pval < alpha:
                col_list.append(col)
                self.wealth = self.wealth + self.delta_alpha - alpha
            else:
                self.wealth = self.wealth - alpha
        
        # add and update cols...        
        sel_cols = list(self.coef_info['cols']) + list(col_list)
        self.coef_info['excluded_cols'] = [col for col in self.seen_cols if col not in sel_cols]
        
    def _fit_columns(self, X_, return_x=True, transform_only=False):
        """
        Method filter through "unselected" columns. The goal of this 
        method is to filter any uninformative columns.
        
        This will be selected based on index only?
        
        If return_x is false, it will only return the boolean mask.
        """
        X = X_[X_.columns.difference(self.coef_info['excluded_cols'])]
        
        # order the columns correctly...
        col_order = self.coef_info['cols'] + list([x for x in X.columns if x not in self.coef_info['cols']])
        X = X[col_order]
        return X

    def _reg_penalty(self, X):
        col_coef = [(col, coef) for col, coef in zip(X.columns.tolist(), self.coef_) if np.abs(coef) >= -0.1]
        self.coef_info['cols'] = [x for x, _ in col_coef]
        self.coef_info['coef'] = [x for _, x in col_coef]
        self.coef_info['excluded_cols'] = [x for x in self.seen_cols if x not in self.coef_info['cols']]
        self.coef_ = np.array(self.coef_info['coef'])

    
    def fit(self, X, y, coef_init=None, intercept_init=None,
            sample_weight=None):
        X_ = X.copy()
        self.seen_cols = list(set(self.seen_cols + X.columns.tolist()))
        super(AlphaInvestRegressor, self).fit(X, y, coef_init=coef_init, intercept_init=intercept_init,
            sample_weight=sample_weight)
        # update params in self...
        self._reg_penalty(X)
        return self
    
    def partial_fit(self, X, y, sample_weight=None):
        X_ = X.copy()
        self.seen_cols = list(set(self.seen_cols + X.columns.tolist()))
                
        # TODO: alpha investing
        self._alpha_sel(X, y)
        X = self._fit_columns(X)
        
        # now update coefficients
        n_samples, n_features = X.shape
        coef_list = np.zeros(n_features, dtype=np.float64, order="C")
        coef_list[:len(self.coef_info['coef'])] = self.coef_info['coef']
        self.coef_ = np.array(coef_list)
        
        super(AlphaInvestRegressor, self).partial_fit(X, y, sample_weight=None)  
        self._reg_penalty(X)
        return self
    
    def predict(self, X):
        X = self._fit_columns(X, transform_only=True)
        print(X.shape)
        return super(AlphaInvestRegressor, self).predict(X)

In [29]:
model = AlphaInvestRegressor(max_iter=1000)
model.fit(X1, y)


Out[29]:
AlphaInvestRegressor(alpha=0.0001, average=False, delta_alpha=0.5,
           epsilon=0.1, eta0=0.01, fit_intercept=True, l1_ratio=0.15,
           learning_rate='invscaling', loss='squared_loss', max_iter=1000,
           n_iter=None, penalty='l1', power_t=0.25, random_state=None,
           shuffle=True, tol=None, verbose=0, warm_start=False, wealth=0.5)

In [30]:
model.coef_


Out[30]:
array([  46.80116426,   30.22180346,    8.92193156,   -1.54397191,
         -6.72566221,  -22.1079264 ,  -25.00232716,   33.84285931,
        -11.25278868,    1.29121497,  -11.76718961,   -2.58871565,
        -18.86362968,   57.33669064,  -14.55793156,  110.3904802 ,
         16.75388767,   18.0352998 ,    1.31859676,   -2.70571059,
         -7.44179881,   70.64134538,    7.09748605,   24.82295686,
         15.90355362,   -2.18482399,   -3.32863643,  -24.20277326,
         -2.41955375,  -22.8086518 ,   14.6865101 ,   -7.89383798,
         -0.63908353,    0.62317447,   -3.88720708,   -2.34831322,
         18.97337076,  -16.33612658,   -1.80662728,   24.73371372,
         18.29242505,   16.40639552,  -16.24767692,   -8.88079219,
         -2.00275109,   13.429332  ,   -6.96642654,   10.576751  ,
         18.45957113,  -19.65023762])

In [31]:
model.coef_.shape


Out[31]:
(50,)

In [32]:
X.shape


Out[32]:
(100, 100)

In [33]:
model.partial_fit(pdf, y)


Out[33]:
AlphaInvestRegressor(alpha=0.0001, average=False, delta_alpha=0.5,
           epsilon=0.1, eta0=0.01, fit_intercept=True, l1_ratio=0.15,
           learning_rate='invscaling', loss='squared_loss', max_iter=1000,
           n_iter=None, penalty='l1', power_t=0.25, random_state=None,
           shuffle=True, tol=None, verbose=0, warm_start=False,
           wealth=0.7103149107067736)

In [34]:
model.coef_.shape


Out[34]:
(51,)

In [35]:
model.coef_


Out[35]:
array([  4.67896922e+01,   3.02272853e+01,   8.90626055e+00,
        -1.54061590e+00,  -6.72040787e+00,  -2.21129288e+01,
        -2.50035643e+01,   3.38376218e+01,  -1.12707241e+01,
         1.29423638e+00,  -1.17811925e+01,  -2.59833144e+00,
        -1.88641435e+01,   5.73285368e+01,  -1.45560541e+01,
         1.10391527e+02,   1.67527701e+01,   1.80278239e+01,
         1.32013931e+00,  -2.69936744e+00,  -7.43945023e+00,
         7.06415770e+01,   7.06807229e+00,   2.48184761e+01,
         1.59116689e+01,  -2.18680114e+00,  -3.33468959e+00,
        -2.42093178e+01,  -2.42467479e+00,  -2.28082276e+01,
         1.46802146e+01,  -7.91276639e+00,  -6.48383876e-01,
         6.40256150e-01,  -3.88636394e+00,  -2.35490508e+00,
         1.89935343e+01,  -1.63377455e+01,  -1.79376142e+00,
         2.47343089e+01,   1.82933949e+01,   1.64018750e+01,
        -1.62512255e+01,  -8.86383490e+00,  -1.99239350e+00,
         1.34093126e+01,  -6.97392235e+00,   1.05741610e+01,
         1.84605058e+01,  -1.96588090e+01,   1.27462781e-02])

In [36]:
model.predict(pdf)


(100, 51)
Out[36]:
array([  4.68612222e+01,  -4.07455447e+01,   1.48496224e+02,
         3.07986420e+01,  -2.85692377e+01,  -5.45946063e+01,
         3.60263148e+00,   2.07636520e+02,   2.52974978e+02,
         3.95064978e+01,  -6.76216008e+00,   6.72505149e+01,
         3.30062737e+01,   2.23969775e+02,   8.34394662e+01,
        -1.23587417e+02,   1.16477231e+02,   1.53451451e+02,
        -1.11756051e+02,  -1.44395933e+02,   5.17081741e+01,
        -2.35614101e+02,   5.34946999e+01,  -7.38528402e+01,
         1.87874390e+02,   2.01112980e+02,  -2.91409946e+02,
         4.68312433e+00,  -6.59421636e+00,   6.51708465e+00,
        -1.76563555e+02,   8.83536803e-02,   2.69641701e+02,
         2.55606134e+02,  -1.19354851e+02,   9.08871234e+01,
        -8.82455021e+01,   1.00221288e+02,   1.00705506e+02,
         2.41440099e+02,  -2.54410156e+01,   1.78269591e+02,
        -4.78966609e+01,  -1.64851540e+02,   1.28495929e+01,
         2.84052076e+01,  -1.58606244e+02,  -5.54148434e+00,
         1.94195484e+02,   1.52902866e+02,  -8.63510252e+01,
        -1.30777332e+02,   2.46922135e+01,  -1.39659794e+02,
        -1.87827983e+02,   1.84951911e+02,  -6.36325233e+01,
        -1.87175753e+02,   3.84549044e+01,  -2.43396384e+02,
        -2.18757051e+02,   1.24505472e+02,  -1.34633441e+01,
        -6.62694024e+01,   1.58086496e+02,   1.00909466e+02,
        -2.80948544e+02,   1.25089618e+00,   1.65847951e+01,
         3.58862250e+01,  -1.20247483e+02,   3.08142043e+02,
         2.38260600e+02,  -1.99617232e+02,   1.82314771e+02,
        -7.76491492e+01,  -1.37672179e+02,  -1.22523016e+01,
         2.36386008e+02,   8.37443994e+01,  -2.68192063e+01,
         2.11538051e+02,   1.27981765e+02,   1.17364121e+02,
        -1.91989523e+02,   3.16258772e+02,  -7.63235915e-01,
         3.91201381e+01,   4.24933921e+01,   8.52656621e+01,
        -4.32046602e+01,   1.47962108e+02,  -1.86105574e+02,
        -1.39721963e+02,   1.09363601e+02,   6.39915999e+01,
        -4.78373800e+01,   2.52855037e+02,  -1.08398944e+02,
        -1.26283158e+02])