notebook.community

Edit and run



In [23]:

    
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.datasets import load_boston, load_iris, load_diabetes, make_classification, make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd
from scipy.stats import spearmanr, pearsonr

import eli5
from eli5.sklearn import PermutationImportance



In [19]:

    
def dcg_score(y_true, y_score, k=10):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])
    gains = 2 ** y_true - 1
    # highest rank is 1 so +2 instead of +1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gains / discounts)

def ndcg_score(y_true, y_score, k=10):
    best = dcg_score(y_true, y_true, k)
    actual = dcg_score(y_true, y_score, k)
    return actual / best



In [46]:

    
def get_classification_datasets():
    res = []

    data = load_iris()
    res.append(('iris_binary', data.data, data.target != 0, data.feature_names))
    
    X, y = make_classification(n_informative=5, n_redundant=0)
    res.append(('CLF(n_informative=5, n_redundant=0)', X, y, None))
    
    X, y = make_classification(n_informative=5, n_redundant=4)
    res.append(('CLF(n_informative=5, n_redundant=4)', X, y, None))

    X, y = make_classification(n_informative=1, n_redundant=4, n_clusters_per_class=1)
    res.append(('CLF(n_informative=1, n_redundant=4)', X, y, None))

    X, y = make_classification(n_informative=20, n_redundant=0)
    res.append(('CLF(n_informative=20, n_redundant=0)', X, y, None))

    return res


def get_regression_datasets():
    res = []
    
    data = load_boston()
    res.append(('boston', data.data, data.target, data.feature_names))
    
    data = load_diabetes()
    res.append(('diabetese', data.data, data.target, None))
    
    X, y = make_regression(n_informative=5)
    res.append(('REG(n_informative=5)', X, y, None))
    
    X, y = make_regression(n_informative=5, effective_rank=2)
    res.append(('REG(n_informative=5, effective_rank=2)', X, y, None))

    X, y = make_regression(n_informative=1)
    res.append(('REG(n_informative=1)', X, y, None))

    X, y = make_regression(n_informative=20)
    res.append(('REG(n_informative=20)', X, y, None))

    return res



In [47]:

    
def get_classifiers():
    return [
        LogisticRegression(),
        LinearSVC(),
        RandomForestClassifier(),
        DecisionTreeClassifier(),
    ]


def get_regressors():
    return [
        make_pipeline(StandardScaler(), LinearRegression()),
        make_pipeline(StandardScaler(), LinearSVR()),
        RandomForestRegressor(),
        DecisionTreeRegressor(),
    ]



In [48]:

    
def get_explanations(est, X, y, feature_names):
    df_inspect = eli5.explain_weights_df(est, feature_names=feature_names, top=100)
    if isinstance(df_inspect.index, pd.MultiIndex):
        df_inspect.index = df_inspect.index.droplevel()
    df_inspect.index.name = None
    
    pi = PermutationImportance(est, cv='prefit', n_iter=10).fit(X, y)
    df_pi = eli5.explain_weights_df(pi, feature_names=feature_names, top=100)
    
    pi_cv = PermutationImportance(est, cv=5, n_iter=10).fit(X, y)
    df_picv = eli5.explain_weights_df(pi_cv, feature_names=feature_names, top=100)
        
    df = pd.concat([df_inspect.weight, df_pi.weight, df_picv.weight], axis=1)
    df.columns=['w_inspect', 'w_pi', 'w_picv']
    df = df.dropna() / df.abs().sum()
    return df



In [65]:

    
def get_scores(df):
    w_inspect_abs = df.w_inspect.abs().values
    def _scores(column):
        return {
            'SpearmanR': spearmanr(w_inspect_abs, column.values).correlation,
            'NDCG': ndcg_score(w_inspect_abs, column.values, 100000),
            'NDCG@5': ndcg_score(w_inspect_abs, column.values, 5),
            'Pearson': pearsonr(w_inspect_abs, column.values)[0],
#             'R^2': r2_score(w_inspect_abs, column.values),
            'L2': np.linalg.norm(w_inspect_abs - column.values),
        }
    return {
        'PI': _scores(df.w_pi),
        'PICV': _scores(df.w_picv),
    }



In [66]:

    
def get_name(est):
    if isinstance(est, Pipeline):
        est = est.steps[-1][1]
    return est.__class__.__name__

dfs = []
estimators = {}
scores = []

def _append(X, y, feature_names, dataset_name, est):
    est.fit(X, y)
    df = get_explanations(est, X, y, feature_names)
    name = get_name(est)
    estimators[name, dataset_name] = est
    dfs.append((name, dataset_name, df))
    for k, v in get_scores(df).items():
        scores.append((name, dataset_name, k, v))
    print("done: {}  {}".format(name, dataset_name))
    

for (dataset_name, X, y, feature_names) in get_classification_datasets():
    for clf in get_classifiers():
        _append(X, y, feature_names, dataset_name, clf)
        
for (dataset_name, X, y, feature_names) in get_regression_datasets():
    for reg in get_regressors():
        _append(X, y, feature_names, dataset_name, reg)









    



done: LogisticRegression  iris_binary
done: LinearSVC  iris_binary
done: RandomForestClassifier  iris_binary
done: DecisionTreeClassifier  iris_binary
done: LogisticRegression  CLF(n_informative=5, n_redundant=0)
done: LinearSVC  CLF(n_informative=5, n_redundant=0)
done: RandomForestClassifier  CLF(n_informative=5, n_redundant=0)
done: DecisionTreeClassifier  CLF(n_informative=5, n_redundant=0)
done: LogisticRegression  CLF(n_informative=5, n_redundant=4)
done: LinearSVC  CLF(n_informative=5, n_redundant=4)
done: RandomForestClassifier  CLF(n_informative=5, n_redundant=4)
done: DecisionTreeClassifier  CLF(n_informative=5, n_redundant=4)
done: LogisticRegression  CLF(n_informative=1, n_redundant=4)
done: LinearSVC  CLF(n_informative=1, n_redundant=4)
done: RandomForestClassifier  CLF(n_informative=1, n_redundant=4)
done: DecisionTreeClassifier  CLF(n_informative=1, n_redundant=4)
done: LogisticRegression  CLF(n_informative=20, n_redundant=0)
done: LinearSVC  CLF(n_informative=20, n_redundant=0)
done: RandomForestClassifier  CLF(n_informative=20, n_redundant=0)
done: DecisionTreeClassifier  CLF(n_informative=20, n_redundant=0)
done: LinearRegression  boston
done: LinearSVR  boston
done: RandomForestRegressor  boston
done: DecisionTreeRegressor  boston
done: LinearRegression  diabetese
done: LinearSVR  diabetese
done: RandomForestRegressor  diabetese
done: DecisionTreeRegressor  diabetese
done: LinearRegression  REG(n_informative=5)
done: LinearSVR  REG(n_informative=5)
done: RandomForestRegressor  REG(n_informative=5)
done: DecisionTreeRegressor  REG(n_informative=5)
done: LinearRegression  REG(n_informative=5, effective_rank=2)
done: LinearSVR  REG(n_informative=5, effective_rank=2)
done: RandomForestRegressor  REG(n_informative=5, effective_rank=2)
done: DecisionTreeRegressor  REG(n_informative=5, effective_rank=2)
done: LinearRegression  REG(n_informative=1)
done: LinearSVR  REG(n_informative=1)
done: RandomForestRegressor  REG(n_informative=1)
done: DecisionTreeRegressor  REG(n_informative=1)
done: LinearRegression  REG(n_informative=20)
done: LinearSVR  REG(n_informative=20)
done: RandomForestRegressor  REG(n_informative=20)
done: DecisionTreeRegressor  REG(n_informative=20)



In [67]:

    
df = pd.DataFrame([s[3] for s in scores])
df = df.assign(
    estimator=[s[0] for s in scores],
    dataset=[s[1] for s in scores],
    type=[s[2] for s in scores],
)
df









    Out[67]:







  
    
      
      L2
      NDCG
      NDCG@5
      Pearson
      SpearmanR
      dataset
      estimator
      type
    
  
  
    
      0
      0.672614
      0.984817
      0.984817
      0.833270
      0.632456
      iris_binary
      LogisticRegression
      PI
    
    
      1
      0.672541
      0.984817
      0.984817
      0.833273
      0.632456
      iris_binary
      LogisticRegression
      PICV
    
    
      2
      0.675399
      1.000000
      1.000000
      0.872404
      0.948683
      iris_binary
      LinearSVC
      PI
    
    
      3
      0.671430
      0.999950
      0.999950
      0.873153
      0.632456
      iris_binary
      LinearSVC
      PICV
    
    
      4
      0.282843
      0.964335
      0.964335
      0.968496
      0.816497
      iris_binary
      RandomForestClassifier
      PI
    
    
      5
      0.109003
      1.000000
      1.000000
      0.993515
      1.000000
      iris_binary
      RandomForestClassifier
      PICV
    
    
      6
      0.000000
      1.000000
      1.000000
      1.000000
      1.000000
      iris_binary
      DecisionTreeClassifier
      PI
    
    
      7
      0.834689
      0.630930
      0.630930
      0.357553
      0.272166
      iris_binary
      DecisionTreeClassifier
      PICV
    
    
      8
      0.167495
      0.971622
      0.965187
      0.872985
      0.908544
      CLF(n_informative=5, n_redundant=0)
      LogisticRegression
      PI
    
    
      9
      0.255409
      0.973934
      0.981802
      0.844037
      0.820918
      CLF(n_informative=5, n_redundant=0)
      LogisticRegression
      PICV
    
    
      10
      0.123809
      0.969013
      0.932469
      0.893776
      0.934538
      CLF(n_informative=5, n_redundant=0)
      LinearSVC
      PI
    
    
      11
      0.252516
      0.914352
      0.784295
      0.622974
      0.684211
      CLF(n_informative=5, n_redundant=0)
      LinearSVC
      PICV
    
    
      12
      0.322749
      0.994469
      0.989704
      0.900801
      0.885049
      CLF(n_informative=5, n_redundant=0)
      RandomForestClassifier
      PI
    
    
      13
      0.277373
      0.946303
      0.900302
      0.769257
      0.374577
      CLF(n_informative=5, n_redundant=0)
      RandomForestClassifier
      PICV
    
    
      14
      0.072285
      0.993702
      0.970213
      0.981011
      0.973103
      CLF(n_informative=5, n_redundant=0)
      DecisionTreeClassifier
      PI
    
    
      15
      0.260483
      0.920732
      0.879966
      0.741018
      0.298858
      CLF(n_informative=5, n_redundant=0)
      DecisionTreeClassifier
      PICV
    
    
      16
      0.273521
      0.981474
      0.973298
      0.812503
      0.817674
      CLF(n_informative=5, n_redundant=4)
      LogisticRegression
      PI
    
    
      17
      0.264550
      0.946682
      0.940374
      0.756563
      0.520301
      CLF(n_informative=5, n_redundant=4)
      LogisticRegression
      PICV
    
    
      18
      0.250067
      0.966972
      0.913251
      0.768077
      0.706813
      CLF(n_informative=5, n_redundant=4)
      LinearSVC
      PI
    
    
      19
      0.304935
      0.957164
      0.938046
      0.744530
      0.711278
      CLF(n_informative=5, n_redundant=4)
      LinearSVC
      PICV
    
    
      20
      0.275022
      0.989233
      0.981340
      0.971646
      0.749610
      CLF(n_informative=5, n_redundant=4)
      RandomForestClassifier
      PI
    
    
      21
      0.263122
      0.955524
      0.922541
      0.930586
      0.383459
      CLF(n_informative=5, n_redundant=4)
      RandomForestClassifier
      PICV
    
    
      22
      0.138652
      0.998331
      0.998283
      0.976383
      0.997411
      CLF(n_informative=5, n_redundant=4)
      DecisionTreeClassifier
      PI
    
    
      23
      0.247536
      0.944404
      0.877370
      0.928444
      0.173572
      CLF(n_informative=5, n_redundant=4)
      DecisionTreeClassifier
      PICV
    
    
      24
      0.336659
      0.990104
      0.978667
      0.918085
      0.875421
      CLF(n_informative=1, n_redundant=4)
      LogisticRegression
      PI
    
    
      25
      0.401469
      0.963876
      0.954015
      0.893001
      0.221249
      CLF(n_informative=1, n_redundant=4)
      LogisticRegression
      PICV
    
    
      26
      0.332961
      0.992544
      0.973712
      0.937979
      0.845940
      CLF(n_informative=1, n_redundant=4)
      LinearSVC
      PI
    
    
      27
      0.341660
      0.971785
      0.975554
      0.895904
      0.352145
      CLF(n_informative=1, n_redundant=4)
      LinearSVC
      PICV
    
    
      28
      0.578752
      0.492292
      0.122312
      -0.288927
      0.191127
      CLF(n_informative=1, n_redundant=4)
      RandomForestClassifier
      PI
    
    
      29
      0.460242
      0.926089
      0.853204
      0.717342
      0.405126
      CLF(n_informative=1, n_redundant=4)
      RandomForestClassifier
      PICV
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      58
      0.235361
      0.991899
      1.000000
      0.927546
      0.737489
      REG(n_informative=5)
      LinearSVR
      PI
    
    
      59
      0.189558
      0.980900
      0.999842
      0.927359
      0.506976
      REG(n_informative=5)
      LinearSVR
      PICV
    
    
      60
      0.142030
      0.990276
      0.984540
      0.980935
      0.779370
      REG(n_informative=5)
      RandomForestRegressor
      PI
    
    
      61
      0.124729
      0.901260
      0.907927
      0.939584
      0.163636
      REG(n_informative=5)
      RandomForestRegressor
      PICV
    
    
      62
      0.107740
      0.988112
      0.977762
      0.975555
      0.999033
      REG(n_informative=5)
      DecisionTreeRegressor
      PI
    
    
      63
      0.179533
      0.911248
      0.942287
      0.883271
      -0.133398
      REG(n_informative=5)
      DecisionTreeRegressor
      PICV
    
    
      64
      0.367224
      0.999991
      1.000000
      0.967209
      0.999629
      REG(n_informative=5, effective_rank=2)
      LinearRegression
      PI
    
    
      65
      0.238332
      0.971812
      0.959122
      0.973212
      0.211095
      REG(n_informative=5, effective_rank=2)
      LinearRegression
      PICV
    
    
      66
      0.401022
      0.999889
      1.000000
      0.966992
      0.996933
      REG(n_informative=5, effective_rank=2)
      LinearSVR
      PI
    
    
      67
      0.284760
      0.980323
      0.978061
      0.977250
      0.509969
      REG(n_informative=5, effective_rank=2)
      LinearSVR
      PICV
    
    
      68
      0.258505
      0.995616
      0.995906
      0.985579
      0.833975
      REG(n_informative=5, effective_rank=2)
      RandomForestRegressor
      PI
    
    
      69
      0.297123
      0.943655
      0.944391
      0.974220
      0.068419
      REG(n_informative=5, effective_rank=2)
      RandomForestRegressor
      PICV
    
    
      70
      0.067249
      0.999453
      1.000000
      0.993935
      0.998769
      REG(n_informative=5, effective_rank=2)
      DecisionTreeRegressor
      PI
    
    
      71
      0.164108
      0.978136
      0.967044
      0.966069
      0.342775
      REG(n_informative=5, effective_rank=2)
      DecisionTreeRegressor
      PICV
    
    
      72
      0.452882
      0.999998
      1.000000
      0.998365
      0.999654
      REG(n_informative=1)
      LinearRegression
      PI
    
    
      73
      0.314697
      0.972598
      0.969714
      0.998146
      0.065281
      REG(n_informative=1)
      LinearRegression
      PICV
    
    
      74
      0.624122
      0.992905
      0.983652
      0.959778
      0.900569
      REG(n_informative=1)
      LinearSVR
      PI
    
    
      75
      0.516824
      0.979657
      0.973410
      0.962190
      0.541348
      REG(n_informative=1)
      LinearSVR
      PICV
    
    
      76
      0.017653
      0.999450
      0.998772
      0.999990
      0.610201
      REG(n_informative=1)
      RandomForestRegressor
      PI
    
    
      77
      0.014198
      0.997467
      0.997146
      0.999982
      0.060978
      REG(n_informative=1)
      RandomForestRegressor
      PICV
    
    
      78
      0.003633
      0.999976
      0.999910
      0.999997
      0.997796
      REG(n_informative=1)
      DecisionTreeRegressor
      PI
    
    
      79
      0.014089
      0.997671
      0.995892
      0.999954
      0.069229
      REG(n_informative=1)
      DecisionTreeRegressor
      PICV
    
    
      80
      0.096921
      0.999998
      1.000000
      0.968106
      0.999678
      REG(n_informative=20)
      LinearRegression
      PI
    
    
      81
      0.095981
      0.989010
      0.984276
      0.932558
      0.446691
      REG(n_informative=20)
      LinearRegression
      PICV
    
    
      82
      0.120204
      0.987079
      0.973107
      0.899272
      0.822808
      REG(n_informative=20)
      LinearSVR
      PI
    
    
      83
      0.125489
      0.987371
      0.973107
      0.904354
      0.797415
      REG(n_informative=20)
      LinearSVR
      PICV
    
    
      84
      0.108996
      0.992210
      0.994836
      0.972288
      0.876028
      REG(n_informative=20)
      RandomForestRegressor
      PI
    
    
      85
      0.207201
      0.780758
      0.687252
      0.502312
      0.156832
      REG(n_informative=20)
      RandomForestRegressor
      PICV
    
    
      86
      0.128705
      0.936889
      0.886772
      0.939217
      0.998556
      REG(n_informative=20)
      DecisionTreeRegressor
      PI
    
    
      87
      0.340442
      0.612293
      0.429809
      0.244711
      0.032248
      REG(n_informative=20)
      DecisionTreeRegressor
      PICV
    
  

88 rows × 8 columns



In [68]:

    
df_pi = df[df.type=="PI"]
df_pi









    Out[68]:







  
    
      
      L2
      NDCG
      NDCG@5
      Pearson
      SpearmanR
      dataset
      estimator
      type
    
  
  
    
      0
      0.672614
      0.984817
      0.984817
      0.833270
      0.632456
      iris_binary
      LogisticRegression
      PI
    
    
      2
      0.675399
      1.000000
      1.000000
      0.872404
      0.948683
      iris_binary
      LinearSVC
      PI
    
    
      4
      0.282843
      0.964335
      0.964335
      0.968496
      0.816497
      iris_binary
      RandomForestClassifier
      PI
    
    
      6
      0.000000
      1.000000
      1.000000
      1.000000
      1.000000
      iris_binary
      DecisionTreeClassifier
      PI
    
    
      8
      0.167495
      0.971622
      0.965187
      0.872985
      0.908544
      CLF(n_informative=5, n_redundant=0)
      LogisticRegression
      PI
    
    
      10
      0.123809
      0.969013
      0.932469
      0.893776
      0.934538
      CLF(n_informative=5, n_redundant=0)
      LinearSVC
      PI
    
    
      12
      0.322749
      0.994469
      0.989704
      0.900801
      0.885049
      CLF(n_informative=5, n_redundant=0)
      RandomForestClassifier
      PI
    
    
      14
      0.072285
      0.993702
      0.970213
      0.981011
      0.973103
      CLF(n_informative=5, n_redundant=0)
      DecisionTreeClassifier
      PI
    
    
      16
      0.273521
      0.981474
      0.973298
      0.812503
      0.817674
      CLF(n_informative=5, n_redundant=4)
      LogisticRegression
      PI
    
    
      18
      0.250067
      0.966972
      0.913251
      0.768077
      0.706813
      CLF(n_informative=5, n_redundant=4)
      LinearSVC
      PI
    
    
      20
      0.275022
      0.989233
      0.981340
      0.971646
      0.749610
      CLF(n_informative=5, n_redundant=4)
      RandomForestClassifier
      PI
    
    
      22
      0.138652
      0.998331
      0.998283
      0.976383
      0.997411
      CLF(n_informative=5, n_redundant=4)
      DecisionTreeClassifier
      PI
    
    
      24
      0.336659
      0.990104
      0.978667
      0.918085
      0.875421
      CLF(n_informative=1, n_redundant=4)
      LogisticRegression
      PI
    
    
      26
      0.332961
      0.992544
      0.973712
      0.937979
      0.845940
      CLF(n_informative=1, n_redundant=4)
      LinearSVC
      PI
    
    
      28
      0.578752
      0.492292
      0.122312
      -0.288927
      0.191127
      CLF(n_informative=1, n_redundant=4)
      RandomForestClassifier
      PI
    
    
      30
      0.015446
      1.000000
      1.000000
      0.999899
      0.999027
      CLF(n_informative=1, n_redundant=4)
      DecisionTreeClassifier
      PI
    
    
      32
      0.173657
      0.979462
      0.964900
      0.877614
      0.700077
      CLF(n_informative=20, n_redundant=0)
      LogisticRegression
      PI
    
    
      34
      0.252788
      0.973859
      0.968925
      0.768863
      0.580477
      CLF(n_informative=20, n_redundant=0)
      LinearSVC
      PI
    
    
      36
      0.140510
      0.938507
      0.768774
      0.672920
      0.654109
      CLF(n_informative=20, n_redundant=0)
      RandomForestClassifier
      PI
    
    
      38
      0.109475
      0.995163
      0.979534
      0.959566
      0.981116
      CLF(n_informative=20, n_redundant=0)
      DecisionTreeClassifier
      PI
    
    
      40
      0.242637
      0.999890
      0.999311
      0.952595
      0.989011
      boston
      LinearRegression
      PI
    
    
      42
      0.318675
      0.997746
      0.994291
      0.950851
      0.972527
      boston
      LinearSVR
      PI
    
    
      44
      0.034486
      0.999036
      0.999015
      0.997868
      0.989011
      boston
      RandomForestRegressor
      PI
    
    
      46
      0.177042
      0.999770
      0.999955
      0.973999
      0.967033
      boston
      DecisionTreeRegressor
      PI
    
    
      48
      0.292793
      1.000000
      1.000000
      0.967100
      1.000000
      diabetese
      LinearRegression
      PI
    
    
      50
      0.352574
      0.985625
      0.984779
      0.954882
      0.951515
      diabetese
      LinearSVR
      PI
    
    
      52
      0.057059
      0.999937
      1.000000
      0.991997
      0.951515
      diabetese
      RandomForestRegressor
      PI
    
    
      54
      0.120583
      0.927134
      0.916092
      0.929929
      0.915152
      diabetese
      DecisionTreeRegressor
      PI
    
    
      56
      0.159032
      0.999994
      1.000000
      0.983486
      0.999579
      REG(n_informative=5)
      LinearRegression
      PI
    
    
      58
      0.235361
      0.991899
      1.000000
      0.927546
      0.737489
      REG(n_informative=5)
      LinearSVR
      PI
    
    
      60
      0.142030
      0.990276
      0.984540
      0.980935
      0.779370
      REG(n_informative=5)
      RandomForestRegressor
      PI
    
    
      62
      0.107740
      0.988112
      0.977762
      0.975555
      0.999033
      REG(n_informative=5)
      DecisionTreeRegressor
      PI
    
    
      64
      0.367224
      0.999991
      1.000000
      0.967209
      0.999629
      REG(n_informative=5, effective_rank=2)
      LinearRegression
      PI
    
    
      66
      0.401022
      0.999889
      1.000000
      0.966992
      0.996933
      REG(n_informative=5, effective_rank=2)
      LinearSVR
      PI
    
    
      68
      0.258505
      0.995616
      0.995906
      0.985579
      0.833975
      REG(n_informative=5, effective_rank=2)
      RandomForestRegressor
      PI
    
    
      70
      0.067249
      0.999453
      1.000000
      0.993935
      0.998769
      REG(n_informative=5, effective_rank=2)
      DecisionTreeRegressor
      PI
    
    
      72
      0.452882
      0.999998
      1.000000
      0.998365
      0.999654
      REG(n_informative=1)
      LinearRegression
      PI
    
    
      74
      0.624122
      0.992905
      0.983652
      0.959778
      0.900569
      REG(n_informative=1)
      LinearSVR
      PI
    
    
      76
      0.017653
      0.999450
      0.998772
      0.999990
      0.610201
      REG(n_informative=1)
      RandomForestRegressor
      PI
    
    
      78
      0.003633
      0.999976
      0.999910
      0.999997
      0.997796
      REG(n_informative=1)
      DecisionTreeRegressor
      PI
    
    
      80
      0.096921
      0.999998
      1.000000
      0.968106
      0.999678
      REG(n_informative=20)
      LinearRegression
      PI
    
    
      82
      0.120204
      0.987079
      0.973107
      0.899272
      0.822808
      REG(n_informative=20)
      LinearSVR
      PI
    
    
      84
      0.108996
      0.992210
      0.994836
      0.972288
      0.876028
      REG(n_informative=20)
      RandomForestRegressor
      PI
    
    
      86
      0.128705
      0.936889
      0.886772
      0.939217
      0.998556
      REG(n_informative=20)
      DecisionTreeRegressor
      PI



In [71]:

    
df_pi.groupby('estimator').mean()









    Out[71]:







  
    
      
      L2
      NDCG
      NDCG@5
      Pearson
      SpearmanR
    
    
      estimator
      
      
      
      
      
    
  
  
    
      DecisionTreeClassifier
      0.067172
      0.997439
      0.989606
      0.983372
      0.990131
    
    
      DecisionTreeRegressor
      0.100825
      0.975222
      0.963415
      0.968772
      0.979390
    
    
      LinearRegression
      0.268582
      0.999978
      0.999885
      0.972810
      0.997925
    
    
      LinearSVC
      0.327005
      0.980478
      0.957671
      0.848220
      0.803290
    
    
      LinearSVR
      0.341993
      0.992524
      0.989305
      0.943220
      0.896974
    
    
      LogisticRegression
      0.324789
      0.981496
      0.973374
      0.862891
      0.786834
    
    
      RandomForestClassifier
      0.319975
      0.875767
      0.765293
      0.644987
      0.659278
    
    
      RandomForestRegressor
      0.103122
      0.996088
      0.995512
      0.988109
      0.840017



In [72]:

    
df_pi.groupby('dataset').mean()









    Out[72]:







  
    
      
      L2
      NDCG
      NDCG@5
      Pearson
      SpearmanR
    
    
      dataset
      
      
      
      
      
    
  
  
    
      CLF(n_informative=1, n_redundant=4)
      0.315954
      0.868735
      0.768673
      0.641759
      0.727879
    
    
      CLF(n_informative=20, n_redundant=0)
      0.169107
      0.971748
      0.920533
      0.819741
      0.728945
    
    
      CLF(n_informative=5, n_redundant=0)
      0.171585
      0.982201
      0.964393
      0.912143
      0.925309
    
    
      CLF(n_informative=5, n_redundant=4)
      0.234315
      0.984003
      0.966543
      0.882152
      0.817877
    
    
      REG(n_informative=1)
      0.274572
      0.998082
      0.995583
      0.989532
      0.877055
    
    
      REG(n_informative=20)
      0.113707
      0.979044
      0.963679
      0.944721
      0.924267
    
    
      REG(n_informative=5)
      0.161041
      0.992570
      0.990575
      0.966880
      0.878868
    
    
      REG(n_informative=5, effective_rank=2)
      0.273500
      0.998737
      0.998977
      0.978429
      0.957326
    
    
      boston
      0.193210
      0.999111
      0.998143
      0.968828
      0.979396
    
    
      diabetese
      0.205752
      0.978174
      0.975218
      0.960977
      0.954545
    
    
      iris_binary
      0.407714
      0.987288
      0.987288
      0.918542
      0.849409



In [ ]:

	L2	NDCG	NDCG@5	Pearson	SpearmanR	dataset	estimator	type
0	0.672614	0.984817	0.984817	0.833270	0.632456	iris_binary	LogisticRegression	PI
1	0.672541	0.984817	0.984817	0.833273	0.632456	iris_binary	LogisticRegression	PICV
2	0.675399	1.000000	1.000000	0.872404	0.948683	iris_binary	LinearSVC	PI
3	0.671430	0.999950	0.999950	0.873153	0.632456	iris_binary	LinearSVC	PICV
4	0.282843	0.964335	0.964335	0.968496	0.816497	iris_binary	RandomForestClassifier	PI
5	0.109003	1.000000	1.000000	0.993515	1.000000	iris_binary	RandomForestClassifier	PICV
6	0.000000	1.000000	1.000000	1.000000	1.000000	iris_binary	DecisionTreeClassifier	PI
7	0.834689	0.630930	0.630930	0.357553	0.272166	iris_binary	DecisionTreeClassifier	PICV
8	0.167495	0.971622	0.965187	0.872985	0.908544	CLF(n_informative=5, n_redundant=0)	LogisticRegression	PI
9	0.255409	0.973934	0.981802	0.844037	0.820918	CLF(n_informative=5, n_redundant=0)	LogisticRegression	PICV
10	0.123809	0.969013	0.932469	0.893776	0.934538	CLF(n_informative=5, n_redundant=0)	LinearSVC	PI
11	0.252516	0.914352	0.784295	0.622974	0.684211	CLF(n_informative=5, n_redundant=0)	LinearSVC	PICV
12	0.322749	0.994469	0.989704	0.900801	0.885049	CLF(n_informative=5, n_redundant=0)	RandomForestClassifier	PI
13	0.277373	0.946303	0.900302	0.769257	0.374577	CLF(n_informative=5, n_redundant=0)	RandomForestClassifier	PICV
14	0.072285	0.993702	0.970213	0.981011	0.973103	CLF(n_informative=5, n_redundant=0)	DecisionTreeClassifier	PI
15	0.260483	0.920732	0.879966	0.741018	0.298858	CLF(n_informative=5, n_redundant=0)	DecisionTreeClassifier	PICV
16	0.273521	0.981474	0.973298	0.812503	0.817674	CLF(n_informative=5, n_redundant=4)	LogisticRegression	PI
17	0.264550	0.946682	0.940374	0.756563	0.520301	CLF(n_informative=5, n_redundant=4)	LogisticRegression	PICV
18	0.250067	0.966972	0.913251	0.768077	0.706813	CLF(n_informative=5, n_redundant=4)	LinearSVC	PI
19	0.304935	0.957164	0.938046	0.744530	0.711278	CLF(n_informative=5, n_redundant=4)	LinearSVC	PICV
20	0.275022	0.989233	0.981340	0.971646	0.749610	CLF(n_informative=5, n_redundant=4)	RandomForestClassifier	PI
21	0.263122	0.955524	0.922541	0.930586	0.383459	CLF(n_informative=5, n_redundant=4)	RandomForestClassifier	PICV
22	0.138652	0.998331	0.998283	0.976383	0.997411	CLF(n_informative=5, n_redundant=4)	DecisionTreeClassifier	PI
23	0.247536	0.944404	0.877370	0.928444	0.173572	CLF(n_informative=5, n_redundant=4)	DecisionTreeClassifier	PICV
24	0.336659	0.990104	0.978667	0.918085	0.875421	CLF(n_informative=1, n_redundant=4)	LogisticRegression	PI
25	0.401469	0.963876	0.954015	0.893001	0.221249	CLF(n_informative=1, n_redundant=4)	LogisticRegression	PICV
26	0.332961	0.992544	0.973712	0.937979	0.845940	CLF(n_informative=1, n_redundant=4)	LinearSVC	PI
27	0.341660	0.971785	0.975554	0.895904	0.352145	CLF(n_informative=1, n_redundant=4)	LinearSVC	PICV
28	0.578752	0.492292	0.122312	-0.288927	0.191127	CLF(n_informative=1, n_redundant=4)	RandomForestClassifier	PI
29	0.460242	0.926089	0.853204	0.717342	0.405126	CLF(n_informative=1, n_redundant=4)	RandomForestClassifier	PICV
...	...	...	...	...	...	...	...	...
58	0.235361	0.991899	1.000000	0.927546	0.737489	REG(n_informative=5)	LinearSVR	PI
59	0.189558	0.980900	0.999842	0.927359	0.506976	REG(n_informative=5)	LinearSVR	PICV
60	0.142030	0.990276	0.984540	0.980935	0.779370	REG(n_informative=5)	RandomForestRegressor	PI
61	0.124729	0.901260	0.907927	0.939584	0.163636	REG(n_informative=5)	RandomForestRegressor	PICV
62	0.107740	0.988112	0.977762	0.975555	0.999033	REG(n_informative=5)	DecisionTreeRegressor	PI
63	0.179533	0.911248	0.942287	0.883271	-0.133398	REG(n_informative=5)	DecisionTreeRegressor	PICV
64	0.367224	0.999991	1.000000	0.967209	0.999629	REG(n_informative=5, effective_rank=2)	LinearRegression	PI
65	0.238332	0.971812	0.959122	0.973212	0.211095	REG(n_informative=5, effective_rank=2)	LinearRegression	PICV
66	0.401022	0.999889	1.000000	0.966992	0.996933	REG(n_informative=5, effective_rank=2)	LinearSVR	PI
67	0.284760	0.980323	0.978061	0.977250	0.509969	REG(n_informative=5, effective_rank=2)	LinearSVR	PICV
68	0.258505	0.995616	0.995906	0.985579	0.833975	REG(n_informative=5, effective_rank=2)	RandomForestRegressor	PI
69	0.297123	0.943655	0.944391	0.974220	0.068419	REG(n_informative=5, effective_rank=2)	RandomForestRegressor	PICV
70	0.067249	0.999453	1.000000	0.993935	0.998769	REG(n_informative=5, effective_rank=2)	DecisionTreeRegressor	PI
71	0.164108	0.978136	0.967044	0.966069	0.342775	REG(n_informative=5, effective_rank=2)	DecisionTreeRegressor	PICV
72	0.452882	0.999998	1.000000	0.998365	0.999654	REG(n_informative=1)	LinearRegression	PI
73	0.314697	0.972598	0.969714	0.998146	0.065281	REG(n_informative=1)	LinearRegression	PICV
74	0.624122	0.992905	0.983652	0.959778	0.900569	REG(n_informative=1)	LinearSVR	PI
75	0.516824	0.979657	0.973410	0.962190	0.541348	REG(n_informative=1)	LinearSVR	PICV
76	0.017653	0.999450	0.998772	0.999990	0.610201	REG(n_informative=1)	RandomForestRegressor	PI
77	0.014198	0.997467	0.997146	0.999982	0.060978	REG(n_informative=1)	RandomForestRegressor	PICV
78	0.003633	0.999976	0.999910	0.999997	0.997796	REG(n_informative=1)	DecisionTreeRegressor	PI
79	0.014089	0.997671	0.995892	0.999954	0.069229	REG(n_informative=1)	DecisionTreeRegressor	PICV
80	0.096921	0.999998	1.000000	0.968106	0.999678	REG(n_informative=20)	LinearRegression	PI
81	0.095981	0.989010	0.984276	0.932558	0.446691	REG(n_informative=20)	LinearRegression	PICV
82	0.120204	0.987079	0.973107	0.899272	0.822808	REG(n_informative=20)	LinearSVR	PI
83	0.125489	0.987371	0.973107	0.904354	0.797415	REG(n_informative=20)	LinearSVR	PICV
84	0.108996	0.992210	0.994836	0.972288	0.876028	REG(n_informative=20)	RandomForestRegressor	PI
85	0.207201	0.780758	0.687252	0.502312	0.156832	REG(n_informative=20)	RandomForestRegressor	PICV
86	0.128705	0.936889	0.886772	0.939217	0.998556	REG(n_informative=20)	DecisionTreeRegressor	PI
87	0.340442	0.612293	0.429809	0.244711	0.032248	REG(n_informative=20)	DecisionTreeRegressor	PICV

	L2	NDCG	NDCG@5	Pearson	SpearmanR
estimator
DecisionTreeClassifier	0.067172	0.997439	0.989606	0.983372	0.990131
DecisionTreeRegressor	0.100825	0.975222	0.963415	0.968772	0.979390
LinearRegression	0.268582	0.999978	0.999885	0.972810	0.997925
LinearSVC	0.327005	0.980478	0.957671	0.848220	0.803290
LinearSVR	0.341993	0.992524	0.989305	0.943220	0.896974
LogisticRegression	0.324789	0.981496	0.973374	0.862891	0.786834
RandomForestClassifier	0.319975	0.875767	0.765293	0.644987	0.659278
RandomForestRegressor	0.103122	0.996088	0.995512	0.988109	0.840017

	L2	NDCG	NDCG@5	Pearson	SpearmanR
dataset
CLF(n_informative=1, n_redundant=4)	0.315954	0.868735	0.768673	0.641759	0.727879
CLF(n_informative=20, n_redundant=0)	0.169107	0.971748	0.920533	0.819741	0.728945
CLF(n_informative=5, n_redundant=0)	0.171585	0.982201	0.964393	0.912143	0.925309
CLF(n_informative=5, n_redundant=4)	0.234315	0.984003	0.966543	0.882152	0.817877
REG(n_informative=1)	0.274572	0.998082	0.995583	0.989532	0.877055
REG(n_informative=20)	0.113707	0.979044	0.963679	0.944721	0.924267
REG(n_informative=5)	0.161041	0.992570	0.990575	0.966880	0.878868
REG(n_informative=5, effective_rank=2)	0.273500	0.998737	0.998977	0.978429	0.957326
boston	0.193210	0.999111	0.998143	0.968828	0.979396
diabetese	0.205752	0.978174	0.975218	0.960977	0.954545
iris_binary	0.407714	0.987288	0.987288	0.918542	0.849409