In [1]:
from sklearn.datasets import make_classification
from itertools import chain

import numpy as np

from sklearn.metrics.pairwise import rbf_kernel
from dpp import sample_dpp, decompose_kernel, sample_conditional_dpp
#from dpp_classifier_supervised import class_separability, evaluate_feats
from ogfs_classifier import spec_supervised, evaluate_feats

In [2]:
import pandas as pd

In [3]:
def get_config_indices(dataset_config):
    """
    Takes the configuration used to create the dataset and returns
    the column indices of each feature group.
    
    With shuffle=False, make_classification creates columns in order:
    1.  the first n_informative features are informative
    2.  the next n_redundant features are redundant (linear combinations
        of the informative features)
    3.  the next n_repeated features are duplicates drawn from the
        informative and redundant features
    4.  the rest of the features are useless noise
    """
    config = {}
    config['n_informative'] = list(range(dataset_config['n_informative']))
    config['n_redundant'] = list(range(dataset_config['n_informative'], 
        dataset_config['n_informative'] + dataset_config['n_redundant']))
    config['n_repeated'] = list(range(dataset_config['n_informative'] + dataset_config['n_redundant'], 
        dataset_config['n_informative'] + dataset_config['n_redundant'] + dataset_config['n_repeated']))
    return config
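
# Worked example (illustrative, using the 20-feature config from the runs below):
# get_config_indices({'n_features': 20, 'n_informative': 10,
#                     'n_redundant': 2, 'n_repeated': 2})
# -> {'n_informative': [0, 1, ..., 9], 'n_redundant': [10, 11], 'n_repeated': [12, 13]}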

def salient_info(sel_feats, dataset_config):
    """
    Scores a feature selection against the known dataset structure.
    Returns (saliency, inform_perc, ratio, compact), where n_g is the
    effective number of "good" features recovered:
    
    *  saliency:    n_g/n_r + n_g/n_f - 1, clipped to [-1, 1]
    *  inform_perc: n_g/n_r, the proportion of informative features recovered
    *  ratio:       n_g/|sel_feats|, the precision of the selection
    *  compact:     |sel_feats|, the number of features selected
    """
    n_f = float(dataset_config['n_features'])
    n_r = float(dataset_config['n_informative'])
    
    m_feats = get_config_indices(dataset_config)
    
    n_info_sel = len([feat for feat in m_feats['n_informative'] if feat in sel_feats])
    n_red_sel = len([feat for feat in m_feats['n_redundant'] if feat in sel_feats])
    n_rep_sel = len([feat for feat in m_feats['n_repeated'] if feat in sel_feats])
    
    # determine n_g: repeated features count fully, redundant features count
    # half, and n_g is capped at the number of informative features
    if n_red_sel == 0:
        n_g = min(n_info_sel + n_rep_sel, n_r)
    else:
        n_g = min(n_info_sel + n_rep_sel + (n_red_sel / 2.0), n_r)
    return max(min(1.0, (n_g / n_r) + (n_g / n_f) - 1.0), -1.0), n_g / n_r, n_g / len(sel_feats), len(sel_feats)
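
# Hedged sanity check (demo_config is introduced here for illustration; not
# part of the original run): a perfect selection of the ten informative
# features in the 20-feature config scores saliency = 10/10 + 10/20 - 1 = 0.5
demo_config = {'n_features': 20, 'n_informative': 10,
               'n_redundant': 2, 'n_repeated': 2}
demo_score = salient_info(list(range(10)), demo_config)
# demo_score == (0.5, 1.0, 1.0, 10)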

In [4]:
def wilcoxon_group(X, f):
    """
    Wilcoxon signed-rank test of a candidate feature f against the
    already-selected columns of X. Wilcoxon is a very aggressive selector
    in an unsupervised sense; we probably also require a supervised,
    score-based group selection to pick the "best" features, similar
    to OGFS.
    """
    from scipy.stats import wilcoxon
    # X is a matrix, f is a single vector
    if len(X.shape) == 1:
        return wilcoxon(X, f).pvalue
    # test f against each column and return the largest p-value (the most
    # similar pairing): f is only admitted if it differs significantly
    # from every selected feature
    return np.max([wilcoxon(x, f).pvalue for x in X.T])
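
# Sketch of the aggregation choice (rng, base, near_dup, shifted are
# illustrative names, not from the original run): a location-shifted column
# gives a tiny p-value while a same-location near-duplicate typically gives
# a large one, so taking the max over selected columns admits f only if it
# differs from every one of them.
rng = np.random.RandomState(0)
base = rng.normal(size=(100, 1))
near_dup = base[:, 0] + rng.normal(scale=1e-2, size=100)  # same location: p typically large
shifted = base[:, 0] + 1.0                                # clear shift: p tiny
p_dup, p_shift = wilcoxon_group(base, near_dup), wilcoxon_group(base, shifted)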

In [5]:
def wilcoxon_feats(X, s_b, s_w, alpha=0.1):
    """
    Greedily selects a subset of features: rank by the score s_b/s_w
    (descending) and admit a feature only if the Wilcoxon test says it
    differs from every feature selected so far (p < alpha).
    """
    s_ord = s_b / s_w
    eval_order = np.argsort(s_ord)[::-1]  # best-scoring features first
    selected_idx = []
    for idx in eval_order:
        if len(selected_idx) == 0:
            selected_idx.append(idx)
            continue
        wilcoxon_pval = wilcoxon_group(X[:, selected_idx], X[:, idx].flatten())
        if wilcoxon_pval < alpha:
            selected_idx.append(idx)
    return selected_idx
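
# Illustrative run on pure noise with made-up s_b/s_w scores (not from the
# original experiments): independent columns have symmetric pairwise
# differences, so candidates rarely reach p < alpha and usually only the
# top-ranked column survives, showing how aggressive this filter is.
rng = np.random.RandomState(1)
X_demo = rng.normal(size=(100, 5))
sel_demo = wilcoxon_feats(X_demo, np.array([5., 4., 3., 2., 1.]), np.ones(5), alpha=0.1)
# sel_demo is usually just [0]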

In [6]:
def dpp_sampler(X, y, k=None, mode='dpp'):
    """
    Takes in a dataset and returns a set of feature indices.
    mode='dpp' uses DPP sampling only; 'supervised' and 'unsupervised'
    additionally filter the DPP sample by the respective criterion.
    """
    # similarity kernel over features (columns), not samples
    feat_dist = rbf_kernel(X.T)
    feat_index = sample_dpp(decompose_kernel(feat_dist), k=k)
    
    X_sel = X[:, feat_index]
    s_b, s_w = spec_supervised(X_sel, y)
    col_sel = evaluate_feats(s_b, s_w)
    ul_sel = wilcoxon_feats(X_sel, s_b, s_w, 0.1)
    
    if mode == 'supervised':
        # union of the score-based and Wilcoxon selections
        feat_index = list(np.array(feat_index)[col_sel]) + list(np.array(feat_index)[ul_sel])
        feat_index = list(set(feat_index))
    
    if mode == 'unsupervised':
        feat_index = list(np.array(feat_index)[ul_sel])
    return feat_index

In [7]:
def dpp_sampler_all(X, y, k=None):
    """
    Takes in a dataset and returns a dict of feature-index sets, one per
    criterion: 'dpp' (DPP sample only), 'sl' (score-based + Wilcoxon
    union) and 'ul' (Wilcoxon only).
    """
    all_info = {}
    feat_dist = rbf_kernel(X.T)
    feat_index = sample_dpp(decompose_kernel(feat_dist), k=k)
    all_info['dpp'] = feat_index[:]
    
    X_sel = X[:, feat_index]
    s_b, s_w = spec_supervised(X_sel, y)
    col_sel = evaluate_feats(s_b, s_w)
    ul_sel = wilcoxon_feats(X_sel, s_b, s_w, 0.05)
    
    dpp_idx = np.array(all_info['dpp'])
    all_info['sl'] = list(set(list(dpp_idx[col_sel]) + list(dpp_idx[ul_sel])))
    all_info['ul'] = list(dpp_idx[ul_sel])
    return all_info

In [8]:
dataset_config = {
    'n_features': 20, 
    'n_informative': 10,
    'n_redundant': 2,
    'n_repeated': 2
}
X, y = make_classification(random_state=0, shuffle=False,
                           **dataset_config)
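
# Illustrative single call (selected indices vary between runs because the
# DPP sample is random):
info = dpp_sampler_all(X, y)
# info.keys() -> dict_keys(['dpp', 'sl', 'ul'])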

In [9]:
def single_run(dataset_config, k=None):
    X, y = make_classification(random_state=0, shuffle=False,
                               **dataset_config)
    # guard: retry if the sampler ever returns None
    feats = None
    while feats is None:
        feats = dpp_sampler_all(X, y, k=k)
    metrics = {}
    metrics['dpp'] = salient_info(feats['dpp'], dataset_config)
    metrics['sl'] = salient_info(feats['sl'], dataset_config)
    metrics['ul'] = salient_info(feats['ul'], dataset_config)
    
    # unpack the (saliency, inform_perc, ratio, compact) tuples into a dataframe
    df_info = [
        {'algorithm': alg, 'saliency': metrics[alg][0], 'inform_perc': metrics[alg][1],
         'ratio': metrics[alg][2], 'compact': metrics[alg][3]}
        for alg in ['dpp', 'sl', 'ul']
    ]
    return pd.DataFrame(df_info)

In [10]:
dataset_config = {
    'n_features': 20, 
    'n_informative': 10,
    'n_redundant': 2,
    'n_repeated': 2
}

In [11]:
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()


Out[11]:
algorithm compact inform_perc ratio saliency
0 dpp 8.8 0.580 0.650795 -0.1300
1 sl 5.7 0.395 0.705476 -0.4075
2 ul 3.0 0.190 0.633333 -0.7150

In [12]:
dataset_config = {
    'n_features': 200, 
    'n_informative': 10,
    'n_redundant': 2,
    'n_repeated': 2
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()


Out[12]:
algorithm compact inform_perc ratio saliency
0 dpp 89.4 0.600 0.067504 -0.37000
1 sl 21.7 0.265 0.409242 -0.72175
2 ul 2.9 0.160 0.550000 -0.83200

In [13]:
dataset_config = {
    'n_features': 200, 
    'n_informative': 20,
    'n_redundant': 5,
    'n_repeated': 5
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()


C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\scipy\stats\morestats.py:2397: UserWarning: Warning: sample size too small for normal approximation.
  warnings.warn("Warning: sample size too small for normal approximation.")
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\scipy\stats\morestats.py:2422: RuntimeWarning: invalid value encountered in double_scalars
  z = (T - mn - correction) / se
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\scipy\stats\_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in greater
  return (self.a < x) & (x < self.b)
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\scipy\stats\_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in less
  return (self.a < x) & (x < self.b)
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\scipy\stats\_distn_infrastructure.py:1818: RuntimeWarning: invalid value encountered in less_equal
  cond2 = cond0 & (x <= self.a)
Out[13]:
algorithm compact inform_perc ratio saliency
0 dpp 88.6 0.5600 0.125780 -0.38400
1 sl 55.6 0.4175 0.384050 -0.54075
2 ul 2.2 0.0925 0.858333 -0.89825

In [14]:
dataset_config = {
    'n_features': 200, 
    'n_informative': 100,
    'n_redundant': 10,
    'n_repeated': 10
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()


Out[14]:
algorithm compact inform_perc ratio saliency
0 dpp 96.8 0.4645 0.481640 -0.30325
1 sl 14.4 0.0770 0.928571 -0.88450
2 ul 1.9 0.0190 1.000000 -0.97150

In [15]:
dataset_config = {
    'n_features': 200, 
    'n_informative': 50,
    'n_redundant': 10,
    'n_repeated': 10
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()


Out[15]:
algorithm compact inform_perc ratio saliency
0 dpp 87.5 0.473 0.271185 -0.40875
1 sl 19.9 0.139 0.678467 -0.82625
2 ul 2.9 0.044 0.790833 -0.94500

In [16]:
dataset_config = {
    'n_features': 40, 
    'n_informative': 20,
    'n_redundant': 0,
    'n_repeated': 10, 
    'n_clusters_per_class': 4
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()


Out[16]:
algorithm compact inform_perc ratio saliency
0 dpp 16.3 0.54 0.660231 -0.190
1 sl 8.6 0.31 0.782710 -0.535
2 ul 1.8 0.08 0.900000 -0.880

In [17]:
dataset_config = {
    'n_features': 200, 
    'n_informative': 20,
    'n_redundant': 0,
    'n_repeated': 5, 
    'n_clusters_per_class': 4
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()


Out[17]:
algorithm compact inform_perc ratio saliency
0 dpp 86.6 0.485 0.111945 -0.4665
1 sl 54.8 0.340 0.320470 -0.6260
2 ul 1.8 0.070 0.800000 -0.9230

Madelon


In [18]:
# Madelon NIPS 2003 setup (see UCI: http://archive.ics.uci.edu/ml/datasets/madelon)
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
dataset_config = {
    'n_features': 500, 
    'n_informative': 5,
    'n_redundant': 15,
    'n_repeated': 0, 
    'n_clusters_per_class': 16
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()


Out[18]:
algorithm compact inform_perc ratio saliency
0 dpp 214.4 0.93 0.021708 -0.0607
1 sl 25.7 0.29 0.338046 -0.7071
2 ul 1.3 0.11 0.550000 -0.8889

In [19]:
# Madelon NIPS 2003 setup (see UCI: http://archive.ics.uci.edu/ml/datasets/madelon)
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
dataset_config = {
    'n_features': 500, 
    'n_informative': 25,
    'n_redundant': 15,
    'n_repeated': 2, 
    'n_clusters_per_class': 16
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()


Out[19]:
algorithm compact inform_perc ratio saliency
0 dpp 217.0 0.534 0.061414 -0.4393
1 sl 88.9 0.232 0.120928 -0.7564
2 ul 2.6 0.022 0.208333 -0.9769

In [20]:
# Madelon NIPS 2003 setup (see UCI: http://archive.ics.uci.edu/ml/datasets/madelon)
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
dataset_config = {
    'n_features': 500, 
    'n_informative': 50,
    'n_redundant': 15,
    'n_repeated': 4, 
    'n_clusters_per_class': 16
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()


Out[20]:
algorithm compact inform_perc ratio saliency
0 dpp 215.0 0.476 0.110674 -0.4764
1 sl 86.9 0.221 0.303620 -0.7569
2 ul 2.2 0.026 0.608333 -0.9714

In [21]:
# Madelon NIPS 2003 setup (see UCI: http://archive.ics.uci.edu/ml/datasets/madelon)
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
dataset_config = {
    'n_features': 500, 
    'n_informative': 75,
    'n_redundant': 15,
    'n_repeated': 6, 
    'n_clusters_per_class': 16
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()


Out[21]:
algorithm compact inform_perc ratio saliency
0 dpp 223.6 0.479333 0.161001 -0.448767
1 sl 180.1 0.382000 0.173122 -0.560700
2 ul 3.1 0.018000 0.470833 -0.979300

In [22]:
# Madelon NIPS 2003 setup (see UCI: http://archive.ics.uci.edu/ml/datasets/madelon)
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
dataset_config = {
    'n_features': 500, 
    'n_informative': 100,
    'n_redundant': 15,
    'n_repeated': 8, 
    'n_clusters_per_class': 16
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()


Out[22]:
algorithm compact inform_perc ratio saliency
0 dpp 213.7 0.4585 0.213980 -0.4498
1 sl 66.4 0.1445 0.314841 -0.8266
2 ul 3.0 0.0135 0.429167 -0.9838

In [23]:
# Madelon NIPS 2003 setup (see UCI: http://archive.ics.uci.edu/ml/datasets/madelon)
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
dataset_config = {
    'n_features': 500, 
    'n_informative': 125,
    'n_redundant': 15,
    'n_repeated': 10, 
    'n_clusters_per_class': 16
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()


Out[23]:
algorithm compact inform_perc ratio saliency
0 dpp 225.0 0.4704 0.261204 -0.4120
1 sl 160.3 0.3396 0.324031 -0.5755
2 ul 2.3 0.0124 0.650000 -0.9845

In [24]:
# Madelon NIPS 2003 setup (see UCI: http://archive.ics.uci.edu/ml/datasets/madelon)
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
dataset_config = {
    'n_features': 500, 
    'n_informative': 150,
    'n_redundant': 15,
    'n_repeated': 12, 
    'n_clusters_per_class': 16
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()


Out[24]:
algorithm compact inform_perc ratio saliency
0 dpp 217.6 0.435333 0.300184 -0.434067
1 sl 113.2 0.225000 0.335526 -0.707500
2 ul 3.1 0.009667 0.487500 -0.987433

In [25]:
# Madelon NIPS 2003 setup (see UCI: http://archive.ics.uci.edu/ml/datasets/madelon)
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
dataset_config = {
    'n_features': 500, 
    'n_informative': 200,
    'n_redundant': 15,
    'n_repeated': 15, 
    'n_clusters_per_class': 16
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()


Out[25]:
algorithm compact inform_perc ratio saliency
0 dpp 229.5 0.4680 0.407878 -0.3448
1 sl 114.8 0.2320 0.599519 -0.6752
2 ul 2.5 0.0095 0.816667 -0.9867

In [26]:
# Madelon NIPS 2003 setup (see UCI: http://archive.ics.uci.edu/ml/datasets/madelon)
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
dataset_config = {
    'n_features': 500, 
    'n_informative': 250,
    'n_redundant': 15,
    'n_repeated': 15, 
    'n_clusters_per_class': 16
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()


Out[26]:
algorithm compact inform_perc ratio saliency
0 dpp 235.1 0.4404 0.468434 -0.3394
1 sl 49.3 0.0956 0.526225 -0.8566
2 ul 3.0 0.0064 0.550000 -0.9904
