In [1]:
from sklearn.datasets import make_classification
from itertools import chain
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel
from dpp import sample_dpp, decompose_kernel, sample_conditional_dpp
#from dpp_classifier_supervised import class_separability, evaluate_feats
from ogfs_classifier import spec_supervised, evaluate_feats
In [2]:
import pandas as pd
In [3]:
def get_config_indices(dataset_config):
    """
    Takes the configuration passed to make_classification and
    returns the column indices for each feature type.
    With shuffle=False, make_classification lays columns out in
    this order:
    1. the first n_informative features are informative
    2. the next n_redundant features are redundant (linear
       combinations of the informative features)
    3. the next n_repeated features are repeated (duplicates drawn
       from the informative and redundant features)
    4. the remaining features are useless noise
    """
    config = {}
    config['n_informative'] = list(range(dataset_config['n_informative']))
    config['n_redundant'] = list(range(dataset_config['n_informative'],
                                       dataset_config['n_informative'] + dataset_config['n_redundant']))
    config['n_repeated'] = list(range(dataset_config['n_informative'] + dataset_config['n_redundant'],
                                      dataset_config['n_informative'] + dataset_config['n_redundant'] + dataset_config['n_repeated']))
    return config
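As a quick sanity check (not an output from the original run), the first configuration used below maps to these index blocks:

get_config_indices({'n_informative': 10, 'n_redundant': 2, 'n_repeated': 2})
# -> {'n_informative': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
#     'n_redundant': [10, 11],
#     'n_repeated': [12, 13]}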
def salient_info(sel_feats, dataset_config):
    """
    Scores a selected feature subset against the known layout.
    Returns (saliency, recall of informative features, precision of
    the selection, number of features selected).
    """
    n_f = float(dataset_config['n_features'])
    n_r = float(dataset_config['n_informative'])
    m_feats = get_config_indices(dataset_config)
    # n_g is the effective number of "good" features selected;
    # redundant features only count for half each
    n_info_sel = len([feat for feat in m_feats['n_informative'] if feat in sel_feats])
    n_red_sel = len([feat for feat in m_feats['n_redundant'] if feat in sel_feats])
    n_rep_sel = len([feat for feat in m_feats['n_repeated'] if feat in sel_feats])
    if n_red_sel == 0:
        n_g = min(n_info_sel + n_rep_sel, n_r)
    else:
        n_g = min(n_info_sel + n_rep_sel + (n_red_sel / 2.0), n_r)
    return max(min(1.0, (n_g / n_r) + (n_g / n_f) - 1.0), -1.0), n_g / n_r, n_g / len(sel_feats), len(sel_feats)
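A hand-worked example (illustrative, not an output from this notebook): with the 20-feature configuration used below, selecting five informative columns plus two useless ones gives n_g = 5, so saliency = max(min(1, 5/10 + 5/20 - 1), -1) = -0.25, recall 0.5, precision 5/7, and seven features selected:

demo_config = {'n_features': 20, 'n_informative': 10, 'n_redundant': 2, 'n_repeated': 2}
salient_info([0, 1, 2, 3, 4, 18, 19], demo_config)
# -> (-0.25, 0.5, 0.7142857142857143, 7)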
In [4]:
def wilcoxon_group(X, f):
    """
    Wilcoxon is a very aggressive selector in an unsupervised sense.
    Do we require a supervised group selection? (probably)
    Probably one that is score-based, in order to select the "best"
    ones similar to OGFS?
    """
    from scipy.stats import wilcoxon
    # X is a matrix of already-selected columns, f is a single candidate vector
    if len(X.shape) == 1:
        return wilcoxon(X, f).pvalue
    # test f against each selected column and return the largest p-value,
    # so the candidate must differ significantly from every column to pass
    return np.max([wilcoxon(x, f).pvalue for x in X.T])
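A minimal sketch of the group test on synthetic data (illustrative only): a candidate with a clear location shift differs from every selected column, so even the largest of its pairwise p-values stays small and it would clear the alpha threshold used below.

rng = np.random.RandomState(0)
X_demo = rng.randn(50, 3)        # three already-selected columns
f_shift = rng.randn(50) + 2.0    # candidate with an obvious location shift
wilcoxon_group(X_demo, f_shift)  # max p-value across the three columns; tiny here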
In [5]:
def wilcoxon_feats(X, s_b, s_w, alpha=0.1):
    """
    Returns a subset of feature indices. Features are visited in
    decreasing order of their s_b/s_w score ratio, and a feature is
    kept only if the Wilcoxon group test says it differs
    significantly (p < alpha) from everything selected so far.
    """
    s_ord = s_b / s_w
    eval_order = np.argsort(s_ord)
    eval_order = eval_order[::-1]  # best score ratio first
    selected_idx = []
    for idx in eval_order:
        if len(selected_idx) == 0:
            selected_idx.append(idx)
            continue
        wilcoxon_pval = wilcoxon_group(X[:, selected_idx], X[:, idx].flatten())
        if wilcoxon_pval < alpha:
            selected_idx.append(idx)
    return selected_idx
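In the pipeline below the s_b/s_w scores come from spec_supervised; here they are stand-ins, just to show the mechanics of the ranked, test-gated selection:

rng = np.random.RandomState(1)
X_demo = rng.randn(60, 5)
s_b_demo = rng.rand(5) + 0.5  # stand-in between-class scores
s_w_demo = rng.rand(5) + 0.5  # stand-in within-class scores
wilcoxon_feats(X_demo, s_b_demo, s_w_demo, alpha=0.1)
# -> column indices in selection order; the top-ranked column is always kept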
In [6]:
def dpp_sampler(X, y, k=None, mode='dpp'):
    """
    Takes in a dataset and returns a set of feature indices.
    Draws a candidate set via DPP sampling over the feature
    similarity kernel, then optionally filters it with a supervised
    or unsupervised criterion.
    """
    # RBF kernel over the features (columns), so the DPP favours diverse features
    feat_dist = rbf_kernel(X.T)
    feat_index = sample_dpp(decompose_kernel(feat_dist), k=k)
    X_sel = X[:, feat_index]
    s_b, s_w = spec_supervised(X_sel, y)
    col_sel = evaluate_feats(s_b, s_w)
    ul_sel = wilcoxon_feats(X_sel, s_b, s_w, 0.1)
    if mode in ['supervised']:
        feat_index = list(np.array(feat_index)[col_sel]) + list(np.array(feat_index)[ul_sel])
        feat_index = list(set(feat_index))
    if mode in ['unsupervised']:
        feat_index = list(np.array(feat_index)[ul_sel])
    return feat_index
In [7]:
def dpp_sampler_all(X, y, k=None):
    """
    Same pipeline as dpp_sampler, but returns all three variants at
    once as a dict: 'dpp' (the raw DPP sample), 'sl' (supervised
    filter: union of the score-based and Wilcoxon selections) and
    'ul' (unsupervised Wilcoxon filter only).
    """
    all_info = {}
    feat_dist = rbf_kernel(X.T)
    feat_index = sample_dpp(decompose_kernel(feat_dist), k=k)
    all_info['dpp'] = feat_index[:]
    X_sel = X[:, feat_index]
    s_b, s_w = spec_supervised(X_sel, y)
    col_sel = evaluate_feats(s_b, s_w)
    ul_sel = wilcoxon_feats(X_sel, s_b, s_w, 0.05)
    feat_index = list(np.array(all_info['dpp'][:])[col_sel]) + list(np.array(all_info['dpp'][:])[ul_sel])
    feat_index = list(set(feat_index))
    all_info['sl'] = feat_index[:]
    feat_index = list(np.array(all_info['dpp'][:])[ul_sel])
    all_info['ul'] = feat_index[:]
    return all_info
In [8]:
dataset_config = {
    'n_features': 20,
    'n_informative': 10,
    'n_redundant': 2,
    'n_repeated': 2
}
X, y = make_classification(random_state=0, shuffle=False,
                           **dataset_config)
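A single draw from the sampler defined above (the DPP draw is stochastic, so the selected indices vary from run to run):

sel = dpp_sampler(X, y, mode='supervised')
sel  # a list of column indices; contents differ per draw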
In [9]:
def single_run(dataset_config, k=None):
    X, y = make_classification(random_state=0, shuffle=False,
                               **dataset_config)
    feats = None
    while feats is None:
        # guard against a failed draw (as written, dpp_sampler_all
        # always returns a dict, so this loop runs once)
        feats = dpp_sampler_all(X, y, k=k)
    metrics = {}
    metrics['dpp'] = salient_info(feats['dpp'], dataset_config)
    metrics['sl'] = salient_info(feats['sl'], dataset_config)
    metrics['ul'] = salient_info(feats['ul'], dataset_config)
    # flatten the (saliency, inform_perc, ratio, compact) tuples into a dataframe
    df_info = [
        {'algorithm': 'dpp', 'saliency': metrics['dpp'][0], 'inform_perc': metrics['dpp'][1], 'ratio': metrics['dpp'][2], 'compact': metrics['dpp'][3]},
        {'algorithm': 'sl', 'saliency': metrics['sl'][0], 'inform_perc': metrics['sl'][1], 'ratio': metrics['sl'][2], 'compact': metrics['sl'][3]},
        {'algorithm': 'ul', 'saliency': metrics['ul'][0], 'inform_perc': metrics['ul'][1], 'ratio': metrics['ul'][2], 'compact': metrics['ul'][3]}
    ]
    return pd.DataFrame(df_info)
In [10]:
dataset_config = {
    'n_features': 20,
    'n_informative': 10,
    'n_redundant': 2,
    'n_repeated': 2
}
In [11]:
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()
Out[11]:
In [12]:
dataset_config = {
    'n_features': 200,
    'n_informative': 10,
    'n_redundant': 2,
    'n_repeated': 2
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()
Out[12]:
In [13]:
dataset_config = {
    'n_features': 200,
    'n_informative': 20,
    'n_redundant': 5,
    'n_repeated': 5
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()
Out[13]:
In [14]:
dataset_config = {
    'n_features': 200,
    'n_informative': 100,
    'n_redundant': 10,
    'n_repeated': 10
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()
Out[14]:
In [15]:
dataset_config = {
    'n_features': 200,
    'n_informative': 50,
    'n_redundant': 10,
    'n_repeated': 10
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()
Out[15]:
In [16]:
dataset_config = {
    'n_features': 40,
    'n_informative': 20,
    'n_redundant': 0,
    'n_repeated': 10,
    'n_clusters_per_class': 4
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()
Out[16]:
In [17]:
dataset_config = {
    'n_features': 200,
    'n_informative': 20,
    'n_redundant': 0,
    'n_repeated': 5,
    'n_clusters_per_class': 4
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()
Out[17]:
In [18]:
# Madelon NIPS 2003 setup (see UCI: http://archive.ics.uci.edu/ml/datasets/madelon)
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
dataset_config = {
    'n_features': 500,
    'n_informative': 5,
    'n_redundant': 15,
    'n_repeated': 0,
    'n_clusters_per_class': 16
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()
Out[18]:
In [19]:
# Madelon NIPS 2003 setup (see UCI: http://archive.ics.uci.edu/ml/datasets/madelon)
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
dataset_config = {
    'n_features': 500,
    'n_informative': 25,
    'n_redundant': 15,
    'n_repeated': 2,
    'n_clusters_per_class': 16
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()
Out[19]:
In [20]:
# Madelon NIPS 2003 setup (see UCI: http://archive.ics.uci.edu/ml/datasets/madelon)
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
dataset_config = {
    'n_features': 500,
    'n_informative': 50,
    'n_redundant': 15,
    'n_repeated': 4,
    'n_clusters_per_class': 16
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()
Out[20]:
In [21]:
# Madelon NIPS 2003 setup (see UCI: http://archive.ics.uci.edu/ml/datasets/madelon)
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
dataset_config = {
    'n_features': 500,
    'n_informative': 75,
    'n_redundant': 15,
    'n_repeated': 6,
    'n_clusters_per_class': 16
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()
Out[21]:
In [22]:
# Madelon NIPS 2003 setup (see UCI: http://archive.ics.uci.edu/ml/datasets/madelon)
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
dataset_config = {
    'n_features': 500,
    'n_informative': 100,
    'n_redundant': 15,
    'n_repeated': 8,
    'n_clusters_per_class': 16
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()
Out[22]:
In [23]:
# Madelon NIPS 2003 setup (see UCI: http://archive.ics.uci.edu/ml/datasets/madelon)
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
dataset_config = {
    'n_features': 500,
    'n_informative': 125,
    'n_redundant': 15,
    'n_repeated': 10,
    'n_clusters_per_class': 16
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()
Out[23]:
In [24]:
# Madelon NIPS 2003 setup (see UCI: http://archive.ics.uci.edu/ml/datasets/madelon)
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
dataset_config = {
    'n_features': 500,
    'n_informative': 150,
    'n_redundant': 15,
    'n_repeated': 12,
    'n_clusters_per_class': 16
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()
Out[24]:
In [25]:
# Madelon NIPS 2003 setup (see UCI: http://archive.ics.uci.edu/ml/datasets/madelon)
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
dataset_config = {
    'n_features': 500,
    'n_informative': 200,
    'n_redundant': 15,
    'n_repeated': 15,
    'n_clusters_per_class': 16
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()
Out[25]:
In [26]:
# Madelon NIPS 2003 setup (see UCI: http://archive.ics.uci.edu/ml/datasets/madelon)
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
dataset_config = {
    'n_features': 500,
    'n_informative': 250,
    'n_redundant': 15,
    'n_repeated': 15,
    'n_clusters_per_class': 16
}
feat_info = pd.concat([single_run(dataset_config) for x in range(10)])
feat_info.groupby('algorithm').mean().reset_index()
Out[26]: