when sampling from k-dpp

We shall sample until one of the following is met:

For speed we will demo the Nystroem kernel

Based on the following notebook: https://github.com/chappers/Context-driven-constraints-for-gradient-boosted-models/blob/master/autoML/streaming/dpp-groupfs.ipynb


In [1]:
import sklearn

In [2]:
from sklearn.datasets import make_regression, make_classification
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import euclidean_distances

import pandas as pd
import numpy as np

from scipy import stats
from scipy.stats import wilcoxon
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.decomposition import PCA
from sklearn.kernel_approximation import Nystroem
from dpp import sample_dpp, decompose_kernel, sample_conditional_dpp

In [3]:
wilcoxon(np.random.normal(size=(100,)), np.random.normal(size=(100,))).pvalue


Out[3]:
0.31538302187180656

In [47]:
X, y = make_classification(n_features=50)
pdf = pd.DataFrame(X)
pdf.columns = ['c{}'.format(x) for x in range(X.shape[1])]

In [48]:
X.shape


Out[48]:
(100, 50)

In [49]:
X1 = pdf[['c{}'.format(x) for x in range(50, 100)]]
X2 = pdf[['c{}'.format(x) for x in range(50)]]


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-49-336de015d5e8> in <module>()
----> 1 X1 = pdf[['c{}'.format(x) for x in range(50, 100)]]
      2 X2 = pdf[['c{}'.format(x) for x in range(50)]]

c:\users\chapm\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   1956         if isinstance(key, (Series, np.ndarray, Index, list)):
   1957             # either boolean or fancy integer index
-> 1958             return self._getitem_array(key)
   1959         elif isinstance(key, DataFrame):
   1960             return self._getitem_frame(key)

c:\users\chapm\anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_array(self, key)
   2000             return self.take(indexer, axis=0, convert=False)
   2001         else:
-> 2002             indexer = self.loc._convert_to_indexer(key, axis=1)
   2003             return self.take(indexer, axis=1, convert=True)
   2004 

c:\users\chapm\anaconda3\lib\site-packages\pandas\core\indexing.py in _convert_to_indexer(self, obj, axis, is_setter)
   1229                 mask = check == -1
   1230                 if mask.any():
-> 1231                     raise KeyError('%s not in index' % objarr[mask])
   1232 
   1233                 return _values_from_object(indexer)

KeyError: "['c50' 'c51' 'c52' 'c53' 'c54' 'c55' 'c56' 'c57' 'c58' 'c59' 'c60' 'c61'\n 'c62' 'c63' 'c64' 'c65' 'c66' 'c67' 'c68' 'c69' 'c70' 'c71' 'c72' 'c73'\n 'c74' 'c75' 'c76' 'c77' 'c78' 'c79' 'c80' 'c81' 'c82' 'c83' 'c84' 'c85'\n 'c86' 'c87' 'c88' 'c89' 'c90' 'c91' 'c92' 'c93' 'c94' 'c95' 'c96' 'c97'\n 'c98' 'c99'] not in index"

In [7]:
[idx for idx, x in enumerate(pdf.columns) if x in ['c0', 'c13']]


Out[7]:
[0, 13]

In [50]:
from collections import Counter
import functools
a = Counter(y)

In [51]:
a.keys()


Out[51]:
dict_keys([0, 1])

In [75]:
X.shape


Out[75]:
(100, 50)

In [85]:
np.mean(X, axis=0).reshape(-1, 1).shape


Out[85]:
(50, 1)

In [116]:
def class_separability(X, y):
    """
    Calculates the class separability based on the mitra paper    
    """    
    # get prior probs
    prior_proba = Counter(y)
    
    s_w = []    
    for class_ in prior_proba.keys():
        mask = y==class_
        X_sel = X[mask, :]
        cov_sig = np.cov(X_sel.T)
        s_w.append(cov_sig * prior_proba[class_])
    s_w = np.add(*s_w)
    
    s_b = []
    m_o = np.mean(X, axis=0).reshape(-1, 1)
    for class_ in prior_proba.keys():
        mu_m = prior_proba[class_] - m_o
        s_b.append(np.dot(mu_m, mu_m.T))
    s_b = np.add(*s_b)
    s_b_inv = np.linalg.inv(s_b)
    S = np.trace(np.dot(s_b_inv, s_w)) # mitra
    S_ = np.trace(s_w)/np.trace(s_b)   # OGFS (criterion1) 
    # return result for each weight...
    S_diag = np.diag(s_w)/np.diag(s_b) # OGFS (criterion2)
    return s_w, s_b, S, S_, S_diag

In [117]:
aa, b, ss, ss1, ss2 = class_separability(X, y) # lower is better

In [118]:
ss


Out[118]:
-525155697915243.56

In [119]:
ss1


Out[119]:
0.020636408117874133

In [122]:
ss2


Out[122]:
array([ 0.0240139 ,  0.02007249,  0.02495617,  0.02661666,  0.00870059,
        0.02273024,  0.01961968,  0.01862494,  0.01766042,  0.02102656,
        0.02273644,  0.01569287,  0.0322513 ,  0.01712509,  0.02149026,
        0.02604032,  0.01953098,  0.02339917,  0.01672218,  0.02250925,
        0.02576408,  0.01876138,  0.01963613,  0.01890476,  0.01844898,
        0.0177503 ,  0.02238352,  0.02492517,  0.01461841,  0.01847299,
        0.02157517,  0.02165669,  0.0178774 ,  0.01832115,  0.01606219,
        0.02343537,  0.02064203,  0.024978  ,  0.02166904,  0.02308293,
        0.02080895,  0.02528485,  0.02105033,  0.01950124,  0.02492153,
        0.01845239,  0.01952898,  0.01927632,  0.01691564,  0.01555949])

In [8]:
def entropy(X):
    mm = MinMaxScaler()
    X_mm = mm.fit_transform(X)
    Dpq = euclidean_distances(X_mm)
    D_bar = np.mean([x for x in np.triu(Dpq).flatten() if x != 0])
    alpha = -np.log(0.5)/D_bar
    sim_pq = np.exp(-alpha * Dpq)
    log_sim_pq = np.log(sim_pq)
    entropy = -2*np.sum(np.triu(sim_pq*log_sim_pq + ((1-sim_pq)*np.log((1-sim_pq))), 1))
    return entropy

In [9]:
def wilcoxon_group(X, f):
    # X is a matrix, f is a single vector
    if len(X.shape) == 1:
        return wilcoxon(X, f).pvalue
    # now we shall perform and check each one...and return only the lowest pvalue
    return np.min([wilcoxon(x, f) for x in X.T])

In [10]:
"""
Implement DPP version that is similar to what is done above


sketch of solution
------------------

DPP requires a known number of parameters to check at each partial fit!


"""

class DPPClassifier(SGDClassifier):
    def __init__(self, loss="log", penalty='l2', alpha=0.0001, l1_ratio=0.15,
                 fit_intercept=True, max_iter=None, tol=None, shuffle=True,
                 verbose=0, epsilon=0.1, n_jobs=1,
                 random_state=None, learning_rate="optimal", eta0=0.0,
                 power_t=0.5, class_weight=None, warm_start=False,
                 average=False, n_iter=None, 
                 intragroup_alpha=0.05, intergroup_thres=None):
        super(DPPClassifier, self).__init__(
            loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio,
            fit_intercept=fit_intercept, max_iter=max_iter, tol=tol,
            shuffle=shuffle, verbose=verbose, epsilon=epsilon, n_jobs=n_jobs,
            random_state=random_state, learning_rate=learning_rate, eta0=eta0,
            power_t=power_t, class_weight=class_weight, warm_start=warm_start,
            average=average, n_iter=n_iter)
        self.coef_info = {'cols': [], 'coef':[], 'excluded_cols': []}
        self.seen_cols = []
        self.base_shape = None
        self.intragroup_alpha = intragroup_alpha
        self.intergroup_thres = intergroup_thres if intergroup_thres is not None else epsilon
    
    def _dpp_estimate_k(self, L):
        """
        L is the input kernel
        """
        pca = PCA(n_components=None)
        pca.fit(L)
        return np.min(np.argwhere(np.cumsum(pca.explained_variance_ratio_) > 
                                  (1-self.intragroup_alpha)))
        
    
    def add_column_exclusion(self, cols):
        self.coef_info['excluded_cols'] = list(self.coef_info['excluded_cols']) + list(cols)
        
    def _fit_columns(self, X_, return_x=True, transform_only=False):
        """
        Method filter through "unselected" columns. The goal of this 
        method is to filter any uninformative columns.
        
        This will be selected based on index only?
        
        If return_x is false, it will only return the boolean mask.
        """
        X = X_[X_.columns.difference(self.coef_info['excluded_cols'])]
        
        # order the columns correctly...
        col_order = self.coef_info['cols'] + list([x for x in X.columns if x not in self.coef_info['cols']])
        X = X[col_order]
        return X

    def _reg_penalty(self, X):
        col_coef = [(col, coef) for col, coef in zip(X.columns.tolist(), self.coef_.flatten()) if np.abs(coef) >= self.intergroup_thres]
        self.coef_info['cols'] = [x for x, _ in col_coef]
        self.coef_info['coef'] = [x for _, x in col_coef]
        self.coef_info['excluded_cols'] = [x for x in self.seen_cols if x not in self.coef_info['cols']]
        self.coef_ = np.array(self.coef_info['coef']).reshape(1, -1) 
    
    def _dpp_sel(self, X_, y=None):
        """
        DPP only relies on X. 
        
        We will condition the sampling based on:
        *  `self.coef_info['cols']`
        
        After sampling it will go ahead and then perform grouped wilcoxon selection.
        """
        X = np.array(X_)
        if X.shape[0] < 1000:
            feat_dist = rbf_kernel(X.T)
        else:
            feat_dist = Nystroem().fit_transform(X.T)
        k = self._dpp_estimate_k(feat_dist) - len(self.coef_info['cols'])
                
        if len(self.coef_info['cols']) == 0:
            feat_index = sample_dpp(decompose_kernel(feat_dist), k=k)
        else:
            cols_to_index = [idx for idx, x in enumerate(X_.columns) if x in self.coef_info['cols']]
            feat_index = sample_conditional_dpp(feat_dist, cols_to_index, k=k)
        
        # select features using entropy measure
        # how can we order features from most to least relevant first?
        # we chould do it using f test? Or otherwise - presume DPP selects best one first
        """
        feat_entropy = []
        excl_entropy = []
        X_sel = X[:, feat_index]
        
        for idx, feat in enumerate(X_sel.T):
            if len(feat_entropy) == 0:
                feat_entropy.append(idx)
                continue
            if entropy(X_sel[:, feat_entropy]) > entropy(X_sel[:, feat_entropy+[idx]]):
                feat_entropy.append(idx)
            else:
                excl_entropy.append(idx)
        """
        # iterate over feat_index to determine 
        # information on wilcoxon test
        # as the feat index are already "ordered" as that is how DPP would
        # perform the sampling - we will do the single pass in the same
        # way it was approached in the OGFS
        feat_check = []
        excl_check = []
        X_sel = X[:, feat_index]
        
        for idx, feat in enumerate(X_sel.T):
            if len(feat_check) == 0:
                feat_check.append(idx)
                continue
            if wilcoxon_group(X_sel[:, feat_check], feat) >= self.intragroup_alpha:
                feat_check.append(idx)
            else:
                excl_check.append(idx)
        index_to_col = [col for idx, col in enumerate(X_.columns) if idx in feat_check]
        self.coef_info['cols'] = list(set(self.coef_info['cols'] + index_to_col))
        col_rem = X_.columns.difference(self.coef_info['cols'])
        self.add_column_exclusion(col_rem)        
        
    def fit(self, X, y, coef_init=None, intercept_init=None,
            sample_weight=None):
        self.seen_cols = list(set(self.seen_cols + X.columns.tolist()))
        
        # TODO: add DPP selection
        self.coef_info = {'cols': [], 'coef':[], 'excluded_cols': []}
        self._dpp_sel(X, y)
        X = self._fit_columns(X)
        
        super(DPPClassifier, self).fit(X, y, coef_init=coef_init, intercept_init=intercept_init,
            sample_weight=sample_weight)
        self._reg_penalty(X)
        return self
    
    def partial_fit(self, X, y, sample_weight=None):
        X_ = X.copy()
        self.seen_cols = list(set(self.seen_cols + X.columns.tolist()))
        X = X[X.columns.difference(self.coef_info['excluded_cols'])]
        
        # TODO: add DPP selection
        self._dpp_sel(X, y)
        X = self._fit_columns(X_)
        
        # now update coefficients
        n_samples, n_features = X.shape
        coef_list = np.zeros(n_features, dtype=np.float64, order="C")
        coef_list[:len(self.coef_info['coef'])] = self.coef_info['coef']
        self.coef_ = np.array(coef_list).reshape(1, -1)
        
        super(DPPClassifier, self).partial_fit(X, y, sample_weight=None)  
        self._reg_penalty(X)
        return self
    
    def predict(self, X):
        X = self._fit_columns(X, transform_only=True)
        return super(DPPClassifier, self).predict(X)

In [11]:
model = DPPClassifier(max_iter=1000)
model.fit(X1, y)


Out[11]:
DPPClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, intergroup_thres=0.1,
       intragroup_alpha=0.05, l1_ratio=0.15, learning_rate='optimal',
       loss='log', max_iter=1000, n_iter=None, n_jobs=1, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)

In [12]:
model.coef_.shape


Out[12]:
(1, 31)

In [13]:
model.partial_fit(pdf, y)


c:\users\chapm\anaconda3\lib\site-packages\scipy\stats\morestats.py:2397: UserWarning: Warning: sample size too small for normal approximation.
  warnings.warn("Warning: sample size too small for normal approximation.")
c:\users\chapm\anaconda3\lib\site-packages\scipy\stats\morestats.py:2422: RuntimeWarning: invalid value encountered in double_scalars
  z = (T - mn - correction) / se
c:\users\chapm\anaconda3\lib\site-packages\scipy\stats\_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in greater
  return (self.a < x) & (x < self.b)
c:\users\chapm\anaconda3\lib\site-packages\scipy\stats\_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in less
  return (self.a < x) & (x < self.b)
c:\users\chapm\anaconda3\lib\site-packages\scipy\stats\_distn_infrastructure.py:1818: RuntimeWarning: invalid value encountered in less_equal
  cond2 = cond0 & (x <= self.a)
Out[13]:
DPPClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, intergroup_thres=0.1,
       intragroup_alpha=0.05, l1_ratio=0.15, learning_rate='optimal',
       loss='log', max_iter=1000, n_iter=None, n_jobs=1, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)

In [14]:
model.coef_.shape


Out[14]:
(1, 50)

In [15]:
model.predict(pdf)


Out[15]:
array([1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 0])