In [132]:
# Preliminaries to work with the data.   
%matplotlib inline
%run __init__.py
import opc_python
from opc_python.utils import loading, scoring
from opc_python.gerkin import dream,params
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [38]:
# Combined training + leaderboard molecule IDs, sorted for a reproducible ordering.
combined_CIDs = loading.get_CIDs('training') + loading.get_CIDs('leaderboard')
training_leaderboard_CIDs = sorted(combined_CIDs)
len(training_leaderboard_CIDs)  # Expect 407 molecules.


Out[38]:
407

Load and organize the features and descriptor data


In [2]:
# Perceptual descriptor names, formatted for display.
descriptors = loading.get_descriptors(format=True)
# All molecule IDs across the three challenge phases, sorted.
all_CIDs = sorted(loading.get_CIDs('training') +
                  loading.get_CIDs('leaderboard') +
                  loading.get_CIDs('testset'))
# Molecular feature data built from three nested feature-source sets.
feature_sets = {'full': ['dragon', 'episuite', 'morgan', 'nspdk', 'gramian'],
                'drag_morg': ['dragon', 'morgan'],
                'drag': ['dragon']}
mdx_full = dream.get_molecular_data(feature_sets['full'], all_CIDs)
mdx_drag_morg = dream.get_molecular_data(feature_sets['drag_morg'], all_CIDs)
mdx_drag = dream.get_molecular_data(feature_sets['drag'], all_CIDs)


Episuite has 62 features for 476 molecules.
Morgan has 2437 features for 476 molecules.
NSPDK has 6163 features for 476 molecules.
NSPDK Gramian has 2437 features for 476 molecules.
There are now 15969 total features.
Morgan has 2437 features for 476 molecules.
There are now 7307 total features.
There are now 4870 total features.

In [3]:
# Create the feature matrices from the feature dicts.
from sklearn.preprocessing import Imputer,MinMaxScaler
X_drag,good1,good2,means,stds,imputer = dream.make_X(mdx_drag,['training','leaderboard'])
X_drag_morg,good1,good2,means,stds,imputer = dream.make_X(mdx_drag_morg,['training','leaderboard'])
def quad_prep(mdx):
    """Build a quadratic feature matrix: min-max scaled features plus their squares.

    The last two columns of the raw matrix (dilution-related) are passed through
    unscaled; missing values are zero-filled before scaling.
    """
    X_raw, _, _, _, _, _ = dream.make_X(mdx, ['training','leaderboard'], raw=True)
    X_raw[np.isnan(X_raw)] = 0  # Zero-fill missing values.
    features, dilution_cols = X_raw[:, :-2], X_raw[:, -2:]
    scaled = MinMaxScaler().fit_transform(features)
    return np.hstack((scaled, scaled**2, dilution_cols))
X_drag_sq = quad_prep(mdx_drag)
X_drag_morg_sq = quad_prep(mdx_drag_morg)
Y_all,imputer = dream.make_Y_obs(['training','leaderboard'],target_dilution=None,imputer='mask')


The X matrix now has shape (814x3063) molecules by non-NaN good molecular descriptors
The X matrix now has shape (814x5497) molecules by non-NaN good molecular descriptors
The X matrix now has shape (814x4871) molecules by non-NaN good molecular descriptors
The X matrix now has shape (814x7308) molecules by non-NaN good molecular descriptors
The Y['mean_std'] matrix now has shape (814x42) molecules by 2 x perceptual descriptors
The Y['subject'] dict now has 49 matrices of shape (814x21) molecules by perceptual descriptors, one for each subject

Load Gabor's feature scores and turn them into ranks


In [4]:
# Load Gabor's per-split, per-descriptor linear feature scores and convert them to rankings.
# Shapes: 250 splits x 21 descriptors x 14613 features without the leak (7306*2 + negLogD).
# NOTE(review): lin_importances is allocated (~4.9 GB of float64) but never written in
# this notebook section — confirm it is used elsewhere; otherwise remove it.
lin_importances = np.zeros((250,21,14613))
lin_ranked = np.zeros((250,21,14613)).astype(int) # Matrix to store the score rankings.
lin_features = pd.read_csv('../../data/linear_scores_and_features/features_dragon_morgan.csv',nrows=0) # Header only (feature names).
for split in range(250):
    if split % 25 == 0:
        print("Finished loading the first %d splits" % split)
    for desc in range(21):
        scores = pd.read_csv('../../data/linear_scores_and_features/LB_scores_morgan%d/scores_%d.csv' % (split,desc),index_col=0)
        if 'Intensity' in scores.index:
            scores.drop(['0','Intensity'],inplace=True) # Get rid of that weird "0" feature and the leak feature.
        if 'neglog10d' not in scores.index:
            scores.loc[14612] = 1 # Add in the dilution if it isn't there.  Assume it is as important as possible.
        scores['int'] = np.arange(14613).astype(int) # Temp index to use for ranking.
        lin_ranked[split,desc,:] = scores.sort_values(by='0',ascending=False)['int'] # Sort best-first and store the ranking.


Finished loading the first 0 splits
Finished loading the first 25 splits
Finished loading the first 50 splits
Finished loading the first 75 splits
Finished loading the first 100 splits
Finished loading the first 125 splits
Finished loading the first 150 splits
Finished loading the first 175 splits
Finished loading the first 200 splits
Finished loading the first 225 splits

In [5]:
# Sanity check on the loaded rankings.
lin_ranked[0,0,:] # The feature ranking for the first split, for intensity. 
                  # negLogD (14612) should appear first if this is working.


Out[5]:
array([14612,  5847,  3552, ...,  5097,  5098,  7306])

Create the train/test splitter and a function for computing the correlation as a function of the number of features


In [25]:
from sklearn.cross_validation import ShuffleSplit  # NOTE(review): deprecated module; modern sklearn moved this to sklearn.model_selection (different API)
from sklearn.linear_model import Ridge
from sklearn.feature_selection import RFE
from sklearn.linear_model import RandomizedLasso

# Feature-count grid (x-axis of the sweep) and number of train/test splits.
n_features = [1,2,3,4,5,10,33,100,333,1000,3333,10000]
n_splits = 100

# Custom splitter from the dream module that produces dilution-aware train/test indices.
DoubleSS = dream.DoubleSS

Y = Y_all['mean_std'] # Use the perceptual means (the second 21 columns are std's, which are ignored here)

In [26]:
def feature_sweep(X_all,Y,n_estimators,n_splits=n_splits,n_features=n_features,model='rf',wrong_split=False,max_features='auto',
                  max_depth=None,min_samples_leaf=1,alpha=1.0,rfe=False,use_rick_lin_ranks=False,random_state=0):
    """Compute test-set correlation vs. number of features for each of the 21 descriptors.

    Parameters
    ----------
    X_all : array (molecules x features); the last column is the high/low dilution
        leak feature, and column -2 encodes dilution (consumed by DoubleSS).
    Y : array (molecules x descriptors) of observed perceptual values.
    n_estimators : number of trees when model='rf'.
    model : 'rf' (forest, ranks by feature importances) or 'ridge' (uses the
        precomputed rankings in the module-level lin_ranked / lin_ranked_rick).
    wrong_split : if True, deliberately pair each data split with the feature
        ranking from a *different* split (demonstrates the selection leak).
    rfe : if truthy, rank features by recursive feature elimination instead.
    use_rick_lin_ranks : use lin_ranked_rick instead of lin_ranked.

    Returns
    -------
    rs : masked array of shape (21, len(n_features), n_splits) of Pearson
        correlations between predicted and observed test values.
    """
    rs = np.ma.zeros((21,len(n_features),n_splits)) # Empty matrix to store correlations.
    n_obs = int(X_all.shape[0]/2) # Number of molecules.
    # NOTE: random_state is deliberately fixed at 0 here (not the random_state argument)
    # so these are the same splits as the train/test_big files on GitHub and the
    # precomputed rankings.
    shuffle_split = ShuffleSplit(n_obs,n_splits,test_size=0.17,random_state=0)
    X = X_all[:,:-1] # Remove high/low dilution feature (the leak); loop-invariant, hoisted.

    for col in range(0,21): # For each descriptor.
        print(col)
        observed = Y[:,col] # Perceptual data for this descriptor.
        # For Intensity (col 0) the grid is shifted by one; n_feat is incremented
        # again below, so Intensity effectively uses n_features + 2 — TODO confirm intended.
        n_features_ = list(np.array(n_features)+(col==0))
        cv = DoubleSS(shuffle_split, col, X_all[:,-2]) # Produce the correct train and test indices.
        for j,(train,test) in enumerate(cv):
            if model == 'rf': # Forest regression (extra trees for Intensity, RF otherwise).
                if col==0:
                    est = ExtraTreesRegressor(n_estimators=n_estimators,max_features=max_features,max_depth=max_depth,
                                              min_samples_leaf=min_samples_leaf,n_jobs=8,random_state=random_state)
                else:
                    est = RandomForestRegressor(n_estimators=n_estimators,max_features=max_features,max_depth=max_depth,
                                              min_samples_leaf=min_samples_leaf,oob_score=False,n_jobs=8,random_state=random_state)
            elif model == 'ridge': # Ridge regression.
                est = Ridge(alpha=alpha,fit_intercept=True, normalize=False, 
                            copy_X=True, max_iter=None, tol=0.001, solver='auto', random_state=random_state)
            if rfe:
                # Use a distinct name: the original code shadowed the `rfe` flag with the RFE instance.
                # NOTE(review): RFE's `step` is documented as int/float; passing the list
                # n_features_ relies on old-sklearn behavior — verify.
                selector = RFE(estimator=est, step=n_features_, n_features_to_select=1)
                selector.fit(X[train,:],observed[train])
            else:
                est.fit(X[train,:],observed[train]) # Fit the model on the training data.
                if model == 'rf':
                    importance_ranks = np.argsort(est.feature_importances_)[::-1] # Best-first ranks from importances.
                elif model == 'ridge':
                    # wrong_split=True offsets the split index so the ranking comes from a different split.
                    if use_rick_lin_ranks:
                        importance_ranks = lin_ranked_rick[int(j+wrong_split) % n_splits,col,:]
                    else:
                        importance_ranks = lin_ranked[int(j+wrong_split) % n_splits,col,:]
            for i,n_feat in enumerate(n_features_):
                if col==0:
                    n_feat += 1 # Add one for intensity since negLogD is worthless when all concentrations are 1/1000.
                if hasattr(est,'max_features') and est.max_features not in [None,'auto']:
                    if n_feat < est.max_features:
                        est.max_features = n_feat
                if rfe:
                    est.fit(X[train,:][:,selector.ranking_<=(1+i)],observed[train])
                    predicted = est.predict(X[test,:][:,selector.ranking_<=(1+i)])
                else:
                    est.fit(X[train,:][:,importance_ranks[:n_feat]],
                            observed[train]) # Fit using only the top n_feat ranked features.
                    predicted = est.predict(X[test,:][:,importance_ranks[:n_feat]]) # Predict the test data.
                rs[col,i,j] = np.corrcoef(predicted,observed[test])[1,0] # Correlation of predicted vs. observed.
    return rs

Compute and plot the correlations using Gabor's feature scores/ranks


In [8]:
# The right way (with feature scores and data from the same splits).
# Ridge regression on the quadratic dragon+morgan features, alpha=1.
rs_lin_right = feature_sweep(X_drag_morg_sq,Y,50,n_splits=n_splits,model='ridge',alpha=1.0,rfe=False)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

In [9]:
# The wrong way (with feature scores and data from different splits) —
# wrong_split=True pairs each split's data with another split's feature ranking.
rs_lin_wrong = feature_sweep(X_drag_morg_sq,Y,50,n_splits=n_splits,model='ridge',wrong_split=True,alpha=1.0,rfe=False)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

In [21]:
def plot(right,wrong,labels=('right','wrong')):
    """Plot correlation vs. number of features, one panel per descriptor.

    right, wrong : masked arrays of shape (21, len(n_features), n_splits) —
        the two conditions to compare (error bars are SEM across splits).
    labels : the two legend labels (default changed from a mutable list to a tuple;
        callers passing a list are unaffected).
    Relies on the module-level n_features, n_splits and descriptors.
    """
    fig,axes = plt.subplots(3,7,sharex=False,sharey=True,figsize=(20,10))
    for col,ax in enumerate(axes.flat):
        ax.errorbar(n_features,right[col,:,:].mean(axis=1),right[col,:,:].std(axis=1)/np.sqrt(n_splits),color='red',label=labels[0])
        ax.errorbar(n_features,wrong[col,:,:].mean(axis=1),wrong[col,:,:].std(axis=1)/np.sqrt(n_splits),color='blue',label=labels[1])
        if col==0:
            # Distinct name so the `labels` argument is not shadowed on later iterations.
            handles, legend_labels = ax.get_legend_handles_labels()
            lg = ax.legend(handles[0:], legend_labels[0:], loc=4, fontsize=16)
            lg.draw_frame(False)

        ax.set_xlim(0.5,20000)
        ax.set_ylim(0,0.8)
        ax.set_yticks(np.linspace(0,0.6,4))
        ax.set_yticklabels(list(np.linspace(0,0.6,4)),size=20)
        # NOTE(review): fixed tick labels on a log axis may not align with the
        # auto-generated ticks — verify rendering.
        ax.set_xticklabels(n_features,size=20)
        ax.set_xscale('log')
        ax.set_title(descriptors[col], size=25)
    plt.tight_layout()
    fig.text(0.5, -0.025, 'Number of features', ha='center', size=25)
    fig.text(-0.02, 0.5, 'Correlation', va='center', rotation='vertical', size=25);

In [12]:
plot(rs_lin_right,rs_lin_wrong)


/Users/rgerkin/Dropbox/python3/lib/python3.4/site-packages/numpy/ma/core.py:4139: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

Now let's create the splits from scratch


In [13]:
import warnings
# Silence the sklearn.cross_validation deprecation warnings emitted by the cells below.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [27]:
# Recompute feature rankings from scratch with a RandomizedLasso, using the same
# n_splits ShuffleSplit splits (random_state=0) as the rest of the notebook.
lin_ranked_rick = np.zeros((n_splits,21,14613)).astype(int) # Matrix to store the score rankings.  
rl = RandomizedLasso(alpha=0.025,selection_threshold=0.025,n_resampling=10,random_state=25,n_jobs=1)
n_obs = int(X_drag_morg_sq.shape[0]/2) # Number of molecules.  
shuffle_split = ShuffleSplit(n_obs,n_splits,test_size=0.17,random_state=0) # This will produce the splits in 
                                                                           # train/test_big that I put on GitHub
X_all = X_drag_morg_sq    
for col in range(21):
    cv = DoubleSS(shuffle_split, col, X_all[:,-2]) # Produce the correct train and test indices.  
    for j,(train,test) in enumerate(cv):
        print(descriptors[col],j)
        observed = Y[train,col]
        X = X_all[train,:-1] # Remove leak feature
        rl.fit(X,observed)
        # Higher RandomizedLasso scores = more important; argsort descending ranks best-first.
        lin_ranked_rick[j,col,:] = np.argsort(rl.all_scores_.ravel())[::-1]


Intensity 0
Intensity 1
Intensity 2
Intensity 3
Intensity 4
Intensity 5
Intensity 6
Intensity 7
Intensity 8
Intensity 9
Intensity 10
Intensity 11
Intensity 12
Intensity 13
Intensity 14
Intensity 15
Intensity 16
Intensity 17
Intensity 18
Intensity 19
Intensity 20
Intensity 21
Intensity 22
Intensity 23
Intensity 24
Intensity 25
Intensity 26
Intensity 27
Intensity 28
Intensity 29
Intensity 30
Intensity 31
Intensity 32
Intensity 33
Intensity 34
Intensity 35
Intensity 36
Intensity 37
Intensity 38
Intensity 39
Intensity 40
Intensity 41
Intensity 42
Intensity 43
Intensity 44
Intensity 45
Intensity 46
Intensity 47
Intensity 48
Intensity 49
Intensity 50
Intensity 51
Intensity 52
Intensity 53
Intensity 54
Intensity 55
Intensity 56
Intensity 57
Intensity 58
Intensity 59
Intensity 60
Intensity 61
Intensity 62
Intensity 63
Intensity 64
Intensity 65
Intensity 66
Intensity 67
Intensity 68
Intensity 69
Intensity 70
Intensity 71
Intensity 72
Intensity 73
Intensity 74
Intensity 75
Intensity 76
Intensity 77
Intensity 78
Intensity 79
Intensity 80
Intensity 81
Intensity 82
Intensity 83
Intensity 84
Intensity 85
Intensity 86
Intensity 87
Intensity 88
Intensity 89
Intensity 90
Intensity 91
Intensity 92
Intensity 93
Intensity 94
Intensity 95
Intensity 96
Intensity 97
Intensity 98
Intensity 99
Pleasantness 0
Pleasantness 1
Pleasantness 2
Pleasantness 3
Pleasantness 4
Pleasantness 5
Pleasantness 6
Pleasantness 7
Pleasantness 8
Pleasantness 9
Pleasantness 10
Pleasantness 11
Pleasantness 12
Pleasantness 13
Pleasantness 14
Pleasantness 15
Pleasantness 16
Pleasantness 17
Pleasantness 18
Pleasantness 19
Pleasantness 20
Pleasantness 21
Pleasantness 22
Pleasantness 23
Pleasantness 24
Pleasantness 25
Pleasantness 26
Pleasantness 27
Pleasantness 28
Pleasantness 29
Pleasantness 30
Pleasantness 31
Pleasantness 32
Pleasantness 33
Pleasantness 34
Pleasantness 35
Pleasantness 36
Pleasantness 37
Pleasantness 38
Pleasantness 39
Pleasantness 40
Pleasantness 41
Pleasantness 42
Pleasantness 43
Pleasantness 44
Pleasantness 45
Pleasantness 46
Pleasantness 47
Pleasantness 48
Pleasantness 49
Pleasantness 50
Pleasantness 51
Pleasantness 52
Pleasantness 53
Pleasantness 54
Pleasantness 55
Pleasantness 56
Pleasantness 57
Pleasantness 58
Pleasantness 59
Pleasantness 60
Pleasantness 61
Pleasantness 62
Pleasantness 63
Pleasantness 64
Pleasantness 65
Pleasantness 66
Pleasantness 67
Pleasantness 68
Pleasantness 69
Pleasantness 70
Pleasantness 71
Pleasantness 72
Pleasantness 73
Pleasantness 74
Pleasantness 75
Pleasantness 76
Pleasantness 77
Pleasantness 78
Pleasantness 79
Pleasantness 80
Pleasantness 81
Pleasantness 82
Pleasantness 83
Pleasantness 84
Pleasantness 85
Pleasantness 86
Pleasantness 87
Pleasantness 88
Pleasantness 89
Pleasantness 90
Pleasantness 91
Pleasantness 92
Pleasantness 93
Pleasantness 94
Pleasantness 95
Pleasantness 96
Pleasantness 97
Pleasantness 98
Pleasantness 99
Bakery 0
Bakery 1
Bakery 2
Bakery 3
Bakery 4
Bakery 5
Bakery 6
Bakery 7
Bakery 8
Bakery 9
Bakery 10
Bakery 11
Bakery 12
Bakery 13
Bakery 14
Bakery 15
Bakery 16
Bakery 17
Bakery 18
Bakery 19
Bakery 20
Bakery 21
Bakery 22
Bakery 23
Bakery 24
Bakery 25
Bakery 26
Bakery 27
Bakery 28
Bakery 29
Bakery 30
Bakery 31
Bakery 32
Bakery 33
Bakery 34
Bakery 35
Bakery 36
Bakery 37
Bakery 38
Bakery 39
Bakery 40
Bakery 41
Bakery 42
Bakery 43
Bakery 44
Bakery 45
Bakery 46
Bakery 47
Bakery 48
Bakery 49
Bakery 50
Bakery 51
Bakery 52
Bakery 53
Bakery 54
Bakery 55
Bakery 56
Bakery 57
Bakery 58
Bakery 59
Bakery 60
Bakery 61
Bakery 62
Bakery 63
Bakery 64
Bakery 65
Bakery 66
Bakery 67
Bakery 68
Bakery 69
Bakery 70
Bakery 71
Bakery 72
Bakery 73
Bakery 74
Bakery 75
Bakery 76
Bakery 77
Bakery 78
Bakery 79
Bakery 80
Bakery 81
Bakery 82
Bakery 83
Bakery 84
Bakery 85
Bakery 86
Bakery 87
Bakery 88
Bakery 89
Bakery 90
Bakery 91
Bakery 92
Bakery 93
Bakery 94
Bakery 95
Bakery 96
Bakery 97
Bakery 98
Bakery 99
Sweet 0
Sweet 1
Sweet 2
Sweet 3
Sweet 4
Sweet 5
Sweet 6
Sweet 7
Sweet 8
Sweet 9
Sweet 10
Sweet 11
Sweet 12
Sweet 13
Sweet 14
Sweet 15
Sweet 16
Sweet 17
Sweet 18
Sweet 19
Sweet 20
Sweet 21
Sweet 22
Sweet 23
Sweet 24
Sweet 25
Sweet 26
Sweet 27
Sweet 28
Sweet 29
Sweet 30
Sweet 31
Sweet 32
Sweet 33
Sweet 34
Sweet 35
Sweet 36
Sweet 37
Sweet 38
Sweet 39
Sweet 40
Sweet 41
Sweet 42
Sweet 43
Sweet 44
Sweet 45
Sweet 46
Sweet 47
Sweet 48
Sweet 49
Sweet 50
Sweet 51
Sweet 52
Sweet 53
Sweet 54
Sweet 55
Sweet 56
Sweet 57
Sweet 58
Sweet 59
Sweet 60
Sweet 61
Sweet 62
Sweet 63
Sweet 64
Sweet 65
Sweet 66
Sweet 67
Sweet 68
Sweet 69
Sweet 70
Sweet 71
Sweet 72
Sweet 73
Sweet 74
Sweet 75
Sweet 76
Sweet 77
Sweet 78
Sweet 79
Sweet 80
Sweet 81
Sweet 82
Sweet 83
Sweet 84
Sweet 85
Sweet 86
Sweet 87
Sweet 88
Sweet 89
Sweet 90
Sweet 91
Sweet 92
Sweet 93
Sweet 94
Sweet 95
Sweet 96
Sweet 97
Sweet 98
Sweet 99
Fruit 0
Fruit 1
Fruit 2
Fruit 3
Fruit 4
Fruit 5
Fruit 6
Fruit 7
Fruit 8
Fruit 9
Fruit 10
Fruit 11
Fruit 12
Fruit 13
Fruit 14
Fruit 15
Fruit 16
Fruit 17
Fruit 18
Fruit 19
Fruit 20
Fruit 21
Fruit 22
Fruit 23
Fruit 24
Fruit 25
Fruit 26
Fruit 27
Fruit 28
Fruit 29
Fruit 30
Fruit 31
Fruit 32
Fruit 33
Fruit 34
Fruit 35
Fruit 36
Fruit 37
Fruit 38
Fruit 39
Fruit 40
Fruit 41
Fruit 42
Fruit 43
Fruit 44
Fruit 45
Fruit 46
Fruit 47
Fruit 48
Fruit 49
Fruit 50
Fruit 51
Fruit 52
Fruit 53
Fruit 54
Fruit 55
Fruit 56
Fruit 57
Fruit 58
Fruit 59
Fruit 60
Fruit 61
Fruit 62
Fruit 63
Fruit 64
Fruit 65
Fruit 66
Fruit 67
Fruit 68
Fruit 69
Fruit 70
Fruit 71
Fruit 72
Fruit 73
Fruit 74
Fruit 75
Fruit 76
Fruit 77
Fruit 78
Fruit 79
Fruit 80
Fruit 81
Fruit 82
Fruit 83
Fruit 84
Fruit 85
Fruit 86
Fruit 87
Fruit 88
Fruit 89
Fruit 90
Fruit 91
Fruit 92
Fruit 93
Fruit 94
Fruit 95
Fruit 96
Fruit 97
Fruit 98
Fruit 99
Fish 0
Fish 1
Fish 2
Fish 3
Fish 4
Fish 5
Fish 6
Fish 7
Fish 8
Fish 9
Fish 10
Fish 11
Fish 12
Fish 13
Fish 14
Fish 15
Fish 16
Fish 17
Fish 18
Fish 19
Fish 20
Fish 21
Fish 22
Fish 23
Fish 24
Fish 25
Fish 26
Fish 27
Fish 28
Fish 29
Fish 30
Fish 31
Fish 32
Fish 33
Fish 34
Fish 35
Fish 36
Fish 37
Fish 38
Fish 39
Fish 40
Fish 41
Fish 42
Fish 43
Fish 44
Fish 45
Fish 46
Fish 47
Fish 48
Fish 49
Fish 50
Fish 51
Fish 52
Fish 53
Fish 54
Fish 55
Fish 56
Fish 57
Fish 58
Fish 59
Fish 60
Fish 61
Fish 62
Fish 63
Fish 64
Fish 65
Fish 66
Fish 67
Fish 68
Fish 69
Fish 70
Fish 71
Fish 72
Fish 73
Fish 74
Fish 75
Fish 76
Fish 77
Fish 78
Fish 79
Fish 80
Fish 81
Fish 82
Fish 83
Fish 84
Fish 85
Fish 86
Fish 87
Fish 88
Fish 89
Fish 90
Fish 91
Fish 92
Fish 93
Fish 94
Fish 95
Fish 96
Fish 97
Fish 98
Fish 99
Garlic 0
Garlic 1
Garlic 2
Garlic 3
Garlic 4
Garlic 5
Garlic 6
Garlic 7
Garlic 8
Garlic 9
Garlic 10
Garlic 11
Garlic 12
Garlic 13
Garlic 14
Garlic 15
Garlic 16
Garlic 17
Garlic 18
Garlic 19
Garlic 20
Garlic 21
Garlic 22
Garlic 23
Garlic 24
Garlic 25
Garlic 26
Garlic 27
Garlic 28
Garlic 29
Garlic 30
Garlic 31
Garlic 32
Garlic 33
Garlic 34
Garlic 35
Garlic 36
Garlic 37
Garlic 38
Garlic 39
Garlic 40
Garlic 41
Garlic 42
Garlic 43
Garlic 44
Garlic 45
Garlic 46
Garlic 47
Garlic 48
Garlic 49
Garlic 50
Garlic 51
Garlic 52
Garlic 53
Garlic 54
Garlic 55
Garlic 56
Garlic 57
Garlic 58
Garlic 59
Garlic 60
Garlic 61
Garlic 62
Garlic 63
Garlic 64
Garlic 65
Garlic 66
Garlic 67
Garlic 68
Garlic 69
Garlic 70
Garlic 71
Garlic 72
Garlic 73
Garlic 74
Garlic 75
Garlic 76
Garlic 77
Garlic 78
Garlic 79
Garlic 80
Garlic 81
Garlic 82
Garlic 83
Garlic 84
Garlic 85
Garlic 86
Garlic 87
Garlic 88
Garlic 89
Garlic 90
Garlic 91
Garlic 92
Garlic 93
Garlic 94
Garlic 95
Garlic 96
Garlic 97
Garlic 98
Garlic 99
Spices 0
Spices 1
Spices 2
Spices 3
Spices 4
Spices 5
Spices 6
Spices 7
Spices 8
Spices 9
Spices 10
Spices 11
Spices 12
Spices 13
Spices 14
Spices 15
Spices 16
Spices 17
Spices 18
Spices 19
Spices 20
Spices 21
Spices 22
Spices 23
Spices 24
Spices 25
Spices 26
Spices 27
Spices 28
Spices 29
Spices 30
Spices 31
Spices 32
Spices 33
Spices 34
Spices 35
Spices 36
Spices 37
Spices 38
Spices 39
Spices 40
Spices 41
Spices 42
Spices 43
Spices 44
Spices 45
Spices 46
Spices 47
Spices 48
Spices 49
Spices 50
Spices 51
Spices 52
Spices 53
Spices 54
Spices 55
Spices 56
Spices 57
Spices 58
Spices 59
Spices 60
Spices 61
Spices 62
Spices 63
Spices 64
Spices 65
Spices 66
Spices 67
Spices 68
Spices 69
Spices 70
Spices 71
Spices 72
Spices 73
Spices 74
Spices 75
Spices 76
Spices 77
Spices 78
Spices 79
Spices 80
Spices 81
Spices 82
Spices 83
Spices 84
Spices 85
Spices 86
Spices 87
Spices 88
Spices 89
Spices 90
Spices 91
Spices 92
Spices 93
Spices 94
Spices 95
Spices 96
Spices 97
Spices 98
Spices 99
Cold 0
Cold 1
Cold 2
Cold 3
Cold 4
Cold 5
Cold 6
Cold 7
Cold 8
Cold 9
Cold 10
Cold 11
Cold 12
Cold 13
Cold 14
Cold 15
Cold 16
Cold 17
Cold 18
Cold 19
Cold 20
Cold 21
Cold 22
Cold 23
Cold 24
Cold 25
Cold 26
Cold 27
Cold 28
Cold 29
Cold 30
Cold 31
Cold 32
Cold 33
Cold 34
Cold 35
Cold 36
Cold 37
Cold 38
Cold 39
Cold 40
Cold 41
Cold 42
Cold 43
Cold 44
Cold 45
Cold 46
Cold 47
Cold 48
Cold 49
Cold 50
Cold 51
Cold 52
Cold 53
Cold 54
Cold 55
Cold 56
Cold 57
Cold 58
Cold 59
Cold 60
Cold 61
Cold 62
Cold 63
Cold 64
Cold 65
Cold 66
Cold 67
Cold 68
Cold 69
Cold 70
Cold 71
Cold 72
Cold 73
Cold 74
Cold 75
Cold 76
Cold 77
Cold 78
Cold 79
Cold 80
Cold 81
Cold 82
Cold 83
Cold 84
Cold 85
Cold 86
Cold 87
Cold 88
Cold 89
Cold 90
Cold 91
Cold 92
Cold 93
Cold 94
Cold 95
Cold 96
Cold 97
Cold 98
Cold 99
Sour 0
Sour 1
Sour 2
Sour 3
Sour 4
Sour 5
Sour 6
Sour 7
Sour 8
Sour 9
Sour 10
Sour 11
Sour 12
Sour 13
Sour 14
Sour 15
Sour 16
Sour 17
Sour 18
Sour 19
Sour 20
Sour 21
Sour 22
Sour 23
Sour 24
Sour 25
Sour 26
Sour 27
Sour 28
Sour 29
Sour 30
Sour 31
Sour 32
Sour 33
Sour 34
Sour 35
Sour 36
Sour 37
Sour 38
Sour 39
Sour 40
Sour 41
Sour 42
Sour 43
Sour 44
Sour 45
Sour 46
Sour 47
Sour 48
Sour 49
Sour 50
Sour 51
Sour 52
Sour 53
Sour 54
Sour 55
Sour 56
Sour 57
Sour 58
Sour 59
Sour 60
Sour 61
Sour 62
Sour 63
Sour 64
Sour 65
Sour 66
Sour 67
Sour 68
Sour 69
Sour 70
Sour 71
Sour 72
Sour 73
Sour 74
Sour 75
Sour 76
Sour 77
Sour 78
Sour 79
Sour 80
Sour 81
Sour 82
Sour 83
Sour 84
Sour 85
Sour 86
Sour 87
Sour 88
Sour 89
Sour 90
Sour 91
Sour 92
Sour 93
Sour 94
Sour 95
Sour 96
Sour 97
Sour 98
Sour 99
Burnt 0
Burnt 1
Burnt 2
Burnt 3
Burnt 4
Burnt 5
Burnt 6
Burnt 7
Burnt 8
Burnt 9
Burnt 10
Burnt 11
Burnt 12
Burnt 13
Burnt 14
Burnt 15
Burnt 16
Burnt 17
Burnt 18
Burnt 19
Burnt 20
Burnt 21
Burnt 22
Burnt 23
Burnt 24
Burnt 25
Burnt 26
Burnt 27
Burnt 28
Burnt 29
Burnt 30
Burnt 31
Burnt 32
Burnt 33
Burnt 34
Burnt 35
Burnt 36
Burnt 37
Burnt 38
Burnt 39
Burnt 40
Burnt 41
Burnt 42
Burnt 43
Burnt 44
Burnt 45
Burnt 46
Burnt 47
Burnt 48
Burnt 49
Burnt 50
Burnt 51
Burnt 52
Burnt 53
Burnt 54
Burnt 55
Burnt 56
Burnt 57
Burnt 58
Burnt 59
Burnt 60
Burnt 61
Burnt 62
Burnt 63
Burnt 64
Burnt 65
Burnt 66
Burnt 67
Burnt 68
Burnt 69
Burnt 70
Burnt 71
Burnt 72
Burnt 73
Burnt 74
Burnt 75
Burnt 76
Burnt 77
Burnt 78
Burnt 79
Burnt 80
Burnt 81
Burnt 82
Burnt 83
Burnt 84
Burnt 85
Burnt 86
Burnt 87
Burnt 88
Burnt 89
Burnt 90
Burnt 91
Burnt 92
Burnt 93
Burnt 94
Burnt 95
Burnt 96
Burnt 97
Burnt 98
Burnt 99
Acid 0
Acid 1
Acid 2
Acid 3
Acid 4
Acid 5
Acid 6
Acid 7
Acid 8
Acid 9
Acid 10
Acid 11
Acid 12
Acid 13
Acid 14
Acid 15
Acid 16
Acid 17
Acid 18
Acid 19
Acid 20
Acid 21
Acid 22
Acid 23
Acid 24
Acid 25
Acid 26
Acid 27
Acid 28
Acid 29
Acid 30
Acid 31
Acid 32
Acid 33
Acid 34
Acid 35
Acid 36
Acid 37
Acid 38
Acid 39
Acid 40
Acid 41
Acid 42
Acid 43
Acid 44
Acid 45
Acid 46
Acid 47
Acid 48
Acid 49
Acid 50
Acid 51
Acid 52
Acid 53
Acid 54
Acid 55
Acid 56
Acid 57
Acid 58
Acid 59
Acid 60
Acid 61
Acid 62
Acid 63
Acid 64
Acid 65
Acid 66
Acid 67
Acid 68
Acid 69
Acid 70
Acid 71
Acid 72
Acid 73
Acid 74
Acid 75
Acid 76
Acid 77
Acid 78
Acid 79
Acid 80
Acid 81
Acid 82
Acid 83
Acid 84
Acid 85
Acid 86
Acid 87
Acid 88
Acid 89
Acid 90
Acid 91
Acid 92
Acid 93
Acid 94
Acid 95
Acid 96
Acid 97
Acid 98
Acid 99
Warm 0
Warm 1
Warm 2
Warm 3
Warm 4
Warm 5
Warm 6
Warm 7
Warm 8
Warm 9
Warm 10
Warm 11
Warm 12
Warm 13
Warm 14
Warm 15
Warm 16
Warm 17
Warm 18
Warm 19
Warm 20
Warm 21
Warm 22
Warm 23
Warm 24
Warm 25
Warm 26
Warm 27
Warm 28
Warm 29
Warm 30
Warm 31
Warm 32
Warm 33
Warm 34
Warm 35
Warm 36
Warm 37
Warm 38
Warm 39
Warm 40
Warm 41
Warm 42
Warm 43
Warm 44
Warm 45
Warm 46
Warm 47
Warm 48
Warm 49
Warm 50
Warm 51
Warm 52
Warm 53
Warm 54
Warm 55
Warm 56
Warm 57
Warm 58
Warm 59
Warm 60
Warm 61
Warm 62
Warm 63
Warm 64
Warm 65
Warm 66
Warm 67
Warm 68
Warm 69
Warm 70
Warm 71
Warm 72
Warm 73
Warm 74
Warm 75
Warm 76
Warm 77
Warm 78
Warm 79
Warm 80
Warm 81
Warm 82
Warm 83
Warm 84
Warm 85
Warm 86
Warm 87
Warm 88
Warm 89
Warm 90
Warm 91
Warm 92
Warm 93
Warm 94
Warm 95
Warm 96
Warm 97
Warm 98
Warm 99
Musky 0
Musky 1
Musky 2
Musky 3
Musky 4
Musky 5
Musky 6
Musky 7
Musky 8
Musky 9
Musky 10
Musky 11
Musky 12
Musky 13
Musky 14
Musky 15
Musky 16
Musky 17
Musky 18
Musky 19
Musky 20
Musky 21
Musky 22
Musky 23
Musky 24
Musky 25
Musky 26
Musky 27
Musky 28
Musky 29
Musky 30
Musky 31
Musky 32
Musky 33
Musky 34
Musky 35
Musky 36
Musky 37
Musky 38
Musky 39
Musky 40
Musky 41
Musky 42
Musky 43
Musky 44
Musky 45
Musky 46
Musky 47
Musky 48
Musky 49
Musky 50
Musky 51
Musky 52
Musky 53
Musky 54
Musky 55
Musky 56
Musky 57
Musky 58
Musky 59
Musky 60
Musky 61
Musky 62
Musky 63
Musky 64
Musky 65
Musky 66
Musky 67
Musky 68
Musky 69
Musky 70
Musky 71
Musky 72
Musky 73
Musky 74
Musky 75
Musky 76
Musky 77
Musky 78
Musky 79
Musky 80
Musky 81
Musky 82
Musky 83
Musky 84
Musky 85
Musky 86
Musky 87
Musky 88
Musky 89
Musky 90
Musky 91
Musky 92
Musky 93
Musky 94
Musky 95
Musky 96
Musky 97
Musky 98
Musky 99
Sweaty 0
Sweaty 1
Sweaty 2
Sweaty 3
Sweaty 4
Sweaty 5
Sweaty 6
Sweaty 7
Sweaty 8
Sweaty 9
Sweaty 10
Sweaty 11
Sweaty 12
Sweaty 13
Sweaty 14
Sweaty 15
Sweaty 16
Sweaty 17
Sweaty 18
Sweaty 19
Sweaty 20
Sweaty 21
Sweaty 22
Sweaty 23
Sweaty 24
Sweaty 25
Sweaty 26
Sweaty 27
Sweaty 28
Sweaty 29
Sweaty 30
Sweaty 31
Sweaty 32
Sweaty 33
Sweaty 34
Sweaty 35
Sweaty 36
Sweaty 37
Sweaty 38
Sweaty 39
Sweaty 40
Sweaty 41
Sweaty 42
Sweaty 43
Sweaty 44
Sweaty 45
Sweaty 46
Sweaty 47
Sweaty 48
Sweaty 49
Sweaty 50
Sweaty 51
Sweaty 52
Sweaty 53
Sweaty 54
Sweaty 55
Sweaty 56
Sweaty 57
Sweaty 58
Sweaty 59
Sweaty 60
Sweaty 61
Sweaty 62
Sweaty 63
Sweaty 64
Sweaty 65
Sweaty 66
Sweaty 67
Sweaty 68
Sweaty 69
Sweaty 70
Sweaty 71
Sweaty 72
Sweaty 73
Sweaty 74
Sweaty 75
Sweaty 76
Sweaty 77
Sweaty 78
Sweaty 79
Sweaty 80
Sweaty 81
Sweaty 82
Sweaty 83
Sweaty 84
Sweaty 85
Sweaty 86
Sweaty 87
Sweaty 88
Sweaty 89
Sweaty 90
Sweaty 91
Sweaty 92
Sweaty 93
Sweaty 94
Sweaty 95
Sweaty 96
Sweaty 97
Sweaty 98
Sweaty 99
Ammonia 0
Ammonia 1
Ammonia 2
Ammonia 3
Ammonia 4
Ammonia 5
Ammonia 6
Ammonia 7
Ammonia 8
Ammonia 9
Ammonia 10
Ammonia 11
Ammonia 12
Ammonia 13
Ammonia 14
Ammonia 15
Ammonia 16
Ammonia 17
Ammonia 18
Ammonia 19
Ammonia 20
Ammonia 21
Ammonia 22
Ammonia 23
Ammonia 24
Ammonia 25
Ammonia 26
Ammonia 27
Ammonia 28
Ammonia 29
Ammonia 30
Ammonia 31
Ammonia 32
Ammonia 33
Ammonia 34
Ammonia 35
Ammonia 36
Ammonia 37
Ammonia 38
Ammonia 39
Ammonia 40
Ammonia 41
Ammonia 42
Ammonia 43
Ammonia 44
Ammonia 45
Ammonia 46
Ammonia 47
Ammonia 48
Ammonia 49
Ammonia 50
Ammonia 51
Ammonia 52
Ammonia 53
Ammonia 54
Ammonia 55
Ammonia 56
Ammonia 57
Ammonia 58
Ammonia 59
Ammonia 60
Ammonia 61
Ammonia 62
Ammonia 63
Ammonia 64
Ammonia 65
Ammonia 66
Ammonia 67
Ammonia 68
Ammonia 69
Ammonia 70
Ammonia 71
Ammonia 72
Ammonia 73
Ammonia 74
Ammonia 75
Ammonia 76
Ammonia 77
Ammonia 78
Ammonia 79
Ammonia 80
Ammonia 81
Ammonia 82
Ammonia 83
Ammonia 84
Ammonia 85
Ammonia 86
Ammonia 87
Ammonia 88
Ammonia 89
Ammonia 90
Ammonia 91
Ammonia 92
Ammonia 93
Ammonia 94
Ammonia 95
Ammonia 96
Ammonia 97
Ammonia 98
Ammonia 99
Decayed 0
Decayed 1
Decayed 2
Decayed 3
Decayed 4
Decayed 5
Decayed 6
Decayed 7
Decayed 8
Decayed 9
Decayed 10
Decayed 11
Decayed 12
Decayed 13
Decayed 14
Decayed 15
Decayed 16
Decayed 17
Decayed 18
Decayed 19
Decayed 20
Decayed 21
Decayed 22
Decayed 23
Decayed 24
Decayed 25
Decayed 26
Decayed 27
Decayed 28
Decayed 29
Decayed 30
Decayed 31
Decayed 32
Decayed 33
Decayed 34
Decayed 35
Decayed 36
Decayed 37
Decayed 38
Decayed 39
Decayed 40
Decayed 41
Decayed 42
Decayed 43
Decayed 44
Decayed 45
Decayed 46
Decayed 47
Decayed 48
Decayed 49
Decayed 50
Decayed 51
Decayed 52
Decayed 53
Decayed 54
Decayed 55
Decayed 56
Decayed 57
Decayed 58
Decayed 59
Decayed 60
Decayed 61
Decayed 62
Decayed 63
Decayed 64
Decayed 65
Decayed 66
Decayed 67
Decayed 68
Decayed 69
Decayed 70
Decayed 71
Decayed 72
Decayed 73
Decayed 74
Decayed 75
Decayed 76
Decayed 77
Decayed 78
Decayed 79
Decayed 80
Decayed 81
Decayed 82
Decayed 83
Decayed 84
Decayed 85
Decayed 86
Decayed 87
Decayed 88
Decayed 89
Decayed 90
Decayed 91
Decayed 92
Decayed 93
Decayed 94
Decayed 95
Decayed 96
Decayed 97
Decayed 98
Decayed 99
Wood 0
Wood 1
Wood 2
Wood 3
Wood 4
Wood 5
Wood 6
Wood 7
Wood 8
Wood 9
Wood 10
Wood 11
Wood 12
Wood 13
Wood 14
Wood 15
Wood 16
Wood 17
Wood 18
Wood 19
Wood 20
Wood 21
Wood 22
Wood 23
Wood 24
Wood 25
Wood 26
Wood 27
Wood 28
Wood 29
Wood 30
Wood 31
Wood 32
Wood 33
Wood 34
Wood 35
Wood 36
Wood 37
Wood 38
Wood 39
Wood 40
Wood 41
Wood 42
Wood 43
Wood 44
Wood 45
Wood 46
Wood 47
Wood 48
Wood 49
Wood 50
Wood 51
Wood 52
Wood 53
Wood 54
Wood 55
Wood 56
Wood 57
Wood 58
Wood 59
Wood 60
Wood 61
Wood 62
Wood 63
Wood 64
Wood 65
Wood 66
Wood 67
Wood 68
Wood 69
Wood 70
Wood 71
Wood 72
Wood 73
Wood 74
Wood 75
Wood 76
Wood 77
Wood 78
Wood 79
Wood 80
Wood 81
Wood 82
Wood 83
Wood 84
Wood 85
Wood 86
Wood 87
Wood 88
Wood 89
Wood 90
Wood 91
Wood 92
Wood 93
Wood 94
Wood 95
Wood 96
Wood 97
Wood 98
Wood 99
Grass 0
Grass 1
Grass 2
Grass 3
Grass 4
Grass 5
Grass 6
Grass 7
Grass 8
Grass 9
Grass 10
Grass 11
Grass 12
Grass 13
Grass 14
Grass 15
Grass 16
Grass 17
Grass 18
Grass 19
Grass 20
Grass 21
Grass 22
Grass 23
Grass 24
Grass 25
Grass 26
Grass 27
Grass 28
Grass 29
Grass 30
Grass 31
Grass 32
Grass 33
Grass 34
Grass 35
Grass 36
Grass 37
Grass 38
Grass 39
Grass 40
Grass 41
Grass 42
Grass 43
Grass 44
Grass 45
Grass 46
Grass 47
Grass 48
Grass 49
Grass 50
Grass 51
Grass 52
Grass 53
Grass 54
Grass 55
Grass 56
Grass 57
Grass 58
Grass 59
Grass 60
Grass 61
Grass 62
Grass 63
Grass 64
Grass 65
Grass 66
Grass 67
Grass 68
Grass 69
Grass 70
Grass 71
Grass 72
Grass 73
Grass 74
Grass 75
Grass 76
Grass 77
Grass 78
Grass 79
Grass 80
Grass 81
Grass 82
Grass 83
Grass 84
Grass 85
Grass 86
Grass 87
Grass 88
Grass 89
Grass 90
Grass 91
Grass 92
Grass 93
Grass 94
Grass 95
Grass 96
Grass 97
Grass 98
Grass 99
Flower 0
Flower 1
Flower 2
Flower 3
Flower 4
Flower 5
Flower 6
Flower 7
Flower 8
Flower 9
Flower 10
Flower 11
Flower 12
Flower 13
Flower 14
Flower 15
Flower 16
Flower 17
Flower 18
Flower 19
Flower 20
Flower 21
Flower 22
Flower 23
Flower 24
Flower 25
Flower 26
Flower 27
Flower 28
Flower 29
Flower 30
Flower 31
Flower 32
Flower 33
Flower 34
Flower 35
Flower 36
Flower 37
Flower 38
Flower 39
Flower 40
Flower 41
Flower 42
Flower 43
Flower 44
Flower 45
Flower 46
Flower 47
Flower 48
Flower 49
Flower 50
Flower 51
Flower 52
Flower 53
Flower 54
Flower 55
Flower 56
Flower 57
Flower 58
Flower 59
Flower 60
Flower 61
Flower 62
Flower 63
Flower 64
Flower 65
Flower 66
Flower 67
Flower 68
Flower 69
Flower 70
Flower 71
Flower 72
Flower 73
Flower 74
Flower 75
Flower 76
Flower 77
Flower 78
Flower 79
Flower 80
Flower 81
Flower 82
Flower 83
Flower 84
Flower 85
Flower 86
Flower 87
Flower 88
Flower 89
Flower 90
Flower 91
Flower 92
Flower 93
Flower 94
Flower 95
Flower 96
Flower 97
Flower 98
Flower 99
Chemical 0
Chemical 1
Chemical 2
Chemical 3
Chemical 4
Chemical 5
Chemical 6
Chemical 7
Chemical 8
Chemical 9
Chemical 10
Chemical 11
Chemical 12
Chemical 13
Chemical 14
Chemical 15
Chemical 16
Chemical 17
Chemical 18
Chemical 19
Chemical 20
Chemical 21
Chemical 22
Chemical 23
Chemical 24
Chemical 25
Chemical 26
Chemical 27
Chemical 28
Chemical 29
Chemical 30
Chemical 31
Chemical 32
Chemical 33
Chemical 34
Chemical 35
Chemical 36
Chemical 37
Chemical 38
Chemical 39
Chemical 40
Chemical 41
Chemical 42
Chemical 43
Chemical 44
Chemical 45
Chemical 46
Chemical 47
Chemical 48
Chemical 49
Chemical 50
Chemical 51
Chemical 52
Chemical 53
Chemical 54
Chemical 55
Chemical 56
Chemical 57
Chemical 58
Chemical 59
Chemical 60
Chemical 61
Chemical 62
Chemical 63
Chemical 64
Chemical 65
Chemical 66
Chemical 67
Chemical 68
Chemical 69
Chemical 70
Chemical 71
Chemical 72
Chemical 73
Chemical 74
Chemical 75
Chemical 76
Chemical 77
Chemical 78
Chemical 79
Chemical 80
Chemical 81
Chemical 82
Chemical 83
Chemical 84
Chemical 85
Chemical 86
Chemical 87
Chemical 88
Chemical 89
Chemical 90
Chemical 91
Chemical 92
Chemical 93
Chemical 94
Chemical 95
Chemical 96
Chemical 97
Chemical 98
Chemical 99

In [29]:
#np.save('../../data/lin_ranked_rick',lin_ranked_rick)

Compute and plot the correlations using my (Rick's) feature scores/ranks


In [88]:
# Feature-count sweep with ridge (alpha=10) on the dragon+morgan quadratic
# features, using Rick's linear feature ranks and the correct CV split
# (wrong_split=False). Presumably returns a (descriptors x steps x splits)
# correlation array like the forest sweeps -- confirm in feature_sweep's source.
rs_lin_rick_right_10 = feature_sweep(X_drag_morg_sq,Y,50,n_splits=n_splits,model='ridge',wrong_split=False,alpha=10.0,rfe=False,use_rick_lin_ranks=True)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

In [89]:
# Same sweep as above but with wrong_split=True -- a deliberately mismatched
# train/test split, used as a control for how much split leakage inflates scores.
rs_lin_rick_wrong_10 = feature_sweep(X_drag_morg_sq,Y,50,n_splits=n_splits,model='ridge',wrong_split=True,alpha=10.0,rfe=False,use_rick_lin_ranks=True)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

In [90]:
# Compare the earlier right-split sweep (rs_lin_rick_right, loaded/computed in a
# previous cell) against the alpha=10 version computed above.
plot(rs_lin_rick_right,rs_lin_rick_right_10)



In [91]:
# Correct split vs. wrong split at alpha=10: the gap quantifies split leakage.
plot(rs_lin_rick_right_10,rs_lin_rick_wrong_10)



In [36]:
#np.save('../../data/rs_lin_rick_right',rs_lin_rick_right.data)

In [128]:
# Reassemble the forest sweep results, which were saved in three partial runs:
# descriptor 0 alone, descriptors 0-9, and descriptors 10-20. Start from the
# 0-9 file, overwrite slot 0 from the single-descriptor run, fill 10-20 from
# the last run, then mask any NaN entries (splits that never finished).
rs0 = np.load('../../data/forest_rs_250_0.npy')
rs0_9 = np.load('../../data/forest_rs_250_0-9.npy')
rs10_20 = np.load('../../data/forest_rs_250_10-20.npy')
rs_forest = rs0_9.copy()
rs_forest[0] = rs0[0]
rs_forest[10:] = rs10_20[10:]
nan_mask = np.isnan(rs_forest)
rs_forest = np.ma.array(rs_forest,mask=nan_mask)

In [129]:
# Overlay the quadratic-ridge sweep and the forest sweep (trimmed to the same
# number of splits) and save the figure for the paper.
plot(rs_lin_rick_right_10,rs_forest[:,:,:n_splits],labels=['quad','forest'])
plt.savefig('../../figures/quad_forest_vs_features.eps',format='eps')



In [97]:
# Rank dragon-only quadratic features per split and per descriptor with a
# stability-selection RandomizedLasso; rankings feed master_cv's feature_list.
# NOTE(review): 9739 is a magic number -- presumably X_drag_sq.shape[1]-1
# (all columns minus the leak feature removed below); confirm against the
# quad_prep output shape.
lin_ranked_rick_drag = np.zeros((n_splits,21,9739)).astype(int) # Matrix to store the score rankings.  
rl = RandomizedLasso(alpha=0.025,selection_threshold=0.025,n_resampling=10,random_state=25,n_jobs=1)
n_obs = int(X_drag_sq.shape[0]/2) # Number of molecules.  
shuffle_split = ShuffleSplit(n_obs,n_splits,test_size=0.17,random_state=0) # This will produce the splits in 
                                                                           # train/test_big that I put on GitHub
X_all = X_drag_sq    
for col in range(21):
    cv = DoubleSS(shuffle_split, col, X_all[:,-2]) # Produce the correct train and test indices.  
    for j,(train,test) in enumerate(cv):
        # Progress line: descriptor name and split index.
        print(descriptors[col],j)
        observed = Y[train,col]
        X = X_all[train,:-1] # Remove leak feature
        rl.fit(X,observed)
        # Highest stability score first; all_scores_ is flattened to 1-D.
        lin_ranked_rick_drag[j,col,:] = np.argsort(rl.all_scores_.ravel())[::-1]


Intensity 0
Intensity 1
Intensity 2
Intensity 3
Intensity 4
Intensity 5
Intensity 6
Intensity 7
Intensity 8
Intensity 9
Intensity 10
Intensity 11
Intensity 12
Intensity 13
Intensity 14
Intensity 15
Intensity 16
Intensity 17
Intensity 18
Intensity 19
Intensity 20
Intensity 21
Intensity 22
Intensity 23
Intensity 24
Pleasantness 0
Pleasantness 1
Pleasantness 2
Pleasantness 3
Pleasantness 4
Pleasantness 5
Pleasantness 6
Pleasantness 7
Pleasantness 8
Pleasantness 9
Pleasantness 10
Pleasantness 11
Pleasantness 12
Pleasantness 13
Pleasantness 14
Pleasantness 15
Pleasantness 16
Pleasantness 17
Pleasantness 18
Pleasantness 19
Pleasantness 20
Pleasantness 21
Pleasantness 22
Pleasantness 23
Pleasantness 24
Bakery 0
Bakery 1
Bakery 2
Bakery 3
Bakery 4
Bakery 5
Bakery 6
Bakery 7
Bakery 8
Bakery 9
Bakery 10
Bakery 11
Bakery 12
Bakery 13
Bakery 14
Bakery 15
Bakery 16
Bakery 17
Bakery 18
Bakery 19
Bakery 20
Bakery 21
Bakery 22
Bakery 23
Bakery 24
Sweet 0
Sweet 1
Sweet 2
Sweet 3
Sweet 4
Sweet 5
Sweet 6
Sweet 7
Sweet 8
Sweet 9
Sweet 10
Sweet 11
Sweet 12
Sweet 13
Sweet 14
Sweet 15
Sweet 16
Sweet 17
Sweet 18
Sweet 19
Sweet 20
Sweet 21
Sweet 22
Sweet 23
Sweet 24
Fruit 0
Fruit 1
Fruit 2
Fruit 3
Fruit 4
Fruit 5
Fruit 6
Fruit 7
Fruit 8
Fruit 9
Fruit 10
Fruit 11
Fruit 12
Fruit 13
Fruit 14
Fruit 15
Fruit 16
Fruit 17
Fruit 18
Fruit 19
Fruit 20
Fruit 21
Fruit 22
Fruit 23
Fruit 24
Fish 0
Fish 1
Fish 2
Fish 3
Fish 4
Fish 5
Fish 6
Fish 7
Fish 8
Fish 9
Fish 10
Fish 11
Fish 12
Fish 13
Fish 14
Fish 15
Fish 16
Fish 17
Fish 18
Fish 19
Fish 20
Fish 21
Fish 22
Fish 23
Fish 24
Garlic 0
Garlic 1
Garlic 2
Garlic 3
Garlic 4
Garlic 5
Garlic 6
Garlic 7
Garlic 8
Garlic 9
Garlic 10
Garlic 11
Garlic 12
Garlic 13
Garlic 14
Garlic 15
Garlic 16
Garlic 17
Garlic 18
Garlic 19
Garlic 20
Garlic 21
Garlic 22
Garlic 23
Garlic 24
Spices 0
Spices 1
Spices 2
Spices 3
Spices 4
Spices 5
Spices 6
Spices 7
Spices 8
Spices 9
Spices 10
Spices 11
Spices 12
Spices 13
Spices 14
Spices 15
Spices 16
Spices 17
Spices 18
Spices 19
Spices 20
Spices 21
Spices 22
Spices 23
Spices 24
Cold 0
Cold 1
Cold 2
Cold 3
Cold 4
Cold 5
Cold 6
Cold 7
Cold 8
Cold 9
Cold 10
Cold 11
Cold 12
Cold 13
Cold 14
Cold 15
Cold 16
Cold 17
Cold 18
Cold 19
Cold 20
Cold 21
Cold 22
Cold 23
Cold 24
Sour 0
Sour 1
Sour 2
Sour 3
Sour 4
Sour 5
Sour 6
Sour 7
Sour 8
Sour 9
Sour 10
Sour 11
Sour 12
Sour 13
Sour 14
Sour 15
Sour 16
Sour 17
Sour 18
Sour 19
Sour 20
Sour 21
Sour 22
Sour 23
Sour 24
Burnt 0
Burnt 1
Burnt 2
Burnt 3
Burnt 4
Burnt 5
Burnt 6
Burnt 7
Burnt 8
Burnt 9
Burnt 10
Burnt 11
Burnt 12
Burnt 13
Burnt 14
Burnt 15
Burnt 16
Burnt 17
Burnt 18
Burnt 19
Burnt 20
Burnt 21
Burnt 22
Burnt 23
Burnt 24
Acid 0
Acid 1
Acid 2
Acid 3
Acid 4
Acid 5
Acid 6
Acid 7
Acid 8
Acid 9
Acid 10
Acid 11
Acid 12
Acid 13
Acid 14
Acid 15
Acid 16
Acid 17
Acid 18
Acid 19
Acid 20
Acid 21
Acid 22
Acid 23
Acid 24
Warm 0
Warm 1
Warm 2
Warm 3
Warm 4
Warm 5
Warm 6
Warm 7
Warm 8
Warm 9
Warm 10
Warm 11
Warm 12
Warm 13
Warm 14
Warm 15
Warm 16
Warm 17
Warm 18
Warm 19
Warm 20
Warm 21
Warm 22
Warm 23
Warm 24
Musky 0
Musky 1
Musky 2
Musky 3
Musky 4
Musky 5
Musky 6
Musky 7
Musky 8
Musky 9
Musky 10
Musky 11
Musky 12
Musky 13
Musky 14
Musky 15
Musky 16
Musky 17
Musky 18
Musky 19
Musky 20
Musky 21
Musky 22
Musky 23
Musky 24
Sweaty 0
Sweaty 1
Sweaty 2
Sweaty 3
Sweaty 4
Sweaty 5
Sweaty 6
Sweaty 7
Sweaty 8
Sweaty 9
Sweaty 10
Sweaty 11
Sweaty 12
Sweaty 13
Sweaty 14
Sweaty 15
Sweaty 16
Sweaty 17
Sweaty 18
Sweaty 19
Sweaty 20
Sweaty 21
Sweaty 22
Sweaty 23
Sweaty 24
Ammonia 0
Ammonia 1
Ammonia 2
Ammonia 3
Ammonia 4
Ammonia 5
Ammonia 6
Ammonia 7
Ammonia 8
Ammonia 9
Ammonia 10
Ammonia 11
Ammonia 12
Ammonia 13
Ammonia 14
Ammonia 15
Ammonia 16
Ammonia 17
Ammonia 18
Ammonia 19
Ammonia 20
Ammonia 21
Ammonia 22
Ammonia 23
Ammonia 24
Decayed 0
Decayed 1
Decayed 2
Decayed 3
Decayed 4
Decayed 5
Decayed 6
Decayed 7
Decayed 8
Decayed 9
Decayed 10
Decayed 11
Decayed 12
Decayed 13
Decayed 14
Decayed 15
Decayed 16
Decayed 17
Decayed 18
Decayed 19
Decayed 20
Decayed 21
Decayed 22
Decayed 23
Decayed 24
Wood 0
Wood 1
Wood 2
Wood 3
Wood 4
Wood 5
Wood 6
Wood 7
Wood 8
Wood 9
Wood 10
Wood 11
Wood 12
Wood 13
Wood 14
Wood 15
Wood 16
Wood 17
Wood 18
Wood 19
Wood 20
Wood 21
Wood 22
Wood 23
Wood 24
Grass 0
Grass 1
Grass 2
Grass 3
Grass 4
Grass 5
Grass 6
Grass 7
Grass 8
Grass 9
Grass 10
Grass 11
Grass 12
Grass 13
Grass 14
Grass 15
Grass 16
Grass 17
Grass 18
Grass 19
Grass 20
Grass 21
Grass 22
Grass 23
Grass 24
Flower 0
Flower 1
Flower 2
Flower 3
Flower 4
Flower 5
Flower 6
Flower 7
Flower 8
Flower 9
Flower 10
Flower 11
Flower 12
Flower 13
Flower 14
Flower 15
Flower 16
Flower 17
Flower 18
Flower 19
Flower 20
Flower 21
Flower 22
Flower 23
Flower 24
Chemical 0
Chemical 1
Chemical 2
Chemical 3
Chemical 4
Chemical 5
Chemical 6
Chemical 7
Chemical 8
Chemical 9
Chemical 10
Chemical 11
Chemical 12
Chemical 13
Chemical 14
Chemical 15
Chemical 16
Chemical 17
Chemical 18
Chemical 19
Chemical 20
Chemical 21
Chemical 22
Chemical 23
Chemical 24

In [95]:
# Number of cross-validation shuffle splits used throughout the cells below.
n_splits = 25

In [119]:
def master_cv(X_all,Y,n_estimators,n_splits=25,model='rfc',alpha=10.0,random_state=0,feature_list=slice(None)):
    """Cross-validated test-set correlations for each of the 21 descriptors.

    Parameters:
    X_all -- feature matrix, shape (2*n_molecules, n_features); column -2 is
             the dilution indicator consumed by DoubleSS, column -1 is a leak
             feature removed before fitting.
    Y -- observed ratings, shape (2*n_molecules, 21), one column per descriptor.
    n_estimators -- number of trees for the forest models (ignored for 'ridge').
    n_splits -- number of shuffle-split folds.
    model -- 'rfc' (ExtraTrees for Intensity, RandomForest otherwise) or 'ridge'.
    alpha -- Ridge regularization strength (model='ridge' only).
    random_state -- random state for the Ridge estimator; the ShuffleSplit seed
                    is intentionally pinned to 0 (see comment below).
    feature_list -- either a slice (default: use all features) or an int array
                    of shape (n_splits, 21, k) giving per-split/per-descriptor
                    column selections (e.g. lin_ranked_rick_drag[:,:,:1000]).

    Returns:
    rs -- array of shape (21, n_splits) of test-set Pearson correlations.
    """
    rs = np.zeros((21,n_splits))
    n_obs = int(X_all.shape[0]/2) # Number of molecules (each appears at two dilutions).
    shuffle_split = ShuffleSplit(n_obs,n_splits,test_size=0.17,random_state=0) # This random state *must* be zero.

    for col in range(21):
        print(col)
        X = X_all[:,:-1] # Remove high/low dilution (leak) feature.
        observed = Y[:,col]
        # NOTE: removed a dead `n_features_ = list(np.array(n_features)+(col==0))`
        # line that referenced an undefined global and whose result was unused.
        cv = DoubleSS(shuffle_split, col, X_all[:,-2]) # Correct train/test indices.
        for j,(train,test) in enumerate(cv):
            if model == 'rfc':
                if col==0:
                    # Intensity (col 0) uses ExtraTrees; other descriptors use RandomForest.
                    est = ExtraTreesRegressor(n_estimators=n_estimators,max_features=max_features[col], max_depth=max_depth[col],
                                            min_samples_leaf=min_samples_leaf[col],
                                            n_jobs=8,random_state=0)
                else:
                    est = RandomForestRegressor(n_estimators=n_estimators,max_features=max_features[col], max_depth=max_depth[col],
                                            min_samples_leaf=min_samples_leaf[col],
                                            oob_score=False,n_jobs=8,random_state=0)
            elif model == 'ridge':
                est = Ridge(alpha=alpha,random_state=random_state)
            else:
                raise ValueError("model must be 'rfc' or 'ridge', got %r" % model)
            # The default slice(None) cannot be indexed like the ranked-feature
            # array, so branch on its type instead of always doing [j,col,:]
            # (which made the documented default crash).
            selected = feature_list if isinstance(feature_list,slice) else feature_list[j,col,:]
            est.fit(X[train,:][:,selected],observed[train])
            predicted = est.predict(X[test,:][:,selected])
            rs[col,j] = np.corrcoef(predicted,observed[test])[1,0]

        mean = rs[col,:].mean()
        sem = rs[col,:].std()/np.sqrt(n_splits) # Standard error; computed but not printed (output format preserved).
        print(('Desc. %d: %.3f' % (col,mean)))
    return rs

In [125]:
# Ridge CV on dragon-only features, restricted to each split's top-1000
# RandomizedLasso-ranked features.
rs_lin_drag = master_cv(X_drag_sq,Y,None,n_splits=25,model='ridge',alpha=10.0,feature_list=lin_ranked_rick_drag[:,:,:1000])


0
Desc. 0: 0.605
1
Desc. 1: 0.519
2
Desc. 2: 0.410
3
Desc. 3: 0.528
4
Desc. 4: 0.580
5
Desc. 5: 0.485
6
Desc. 6: 0.577
7
Desc. 7: 0.425
8
Desc. 8: 0.375
9
Desc. 9: 0.409
10
Desc. 10: 0.501
11
Desc. 11: 0.327
12
Desc. 12: 0.260
13
Desc. 13: 0.402
14
Desc. 14: 0.388
15
Desc. 15: 0.302
16
Desc. 16: 0.448
17
Desc. 17: 0.379
18
Desc. 18: 0.332
19
Desc. 19: 0.511
20
Desc. 20: 0.503

In [126]:
# Ridge CV on dragon+morgan features with the corresponding top-1000 ranks
# (lin_ranked_rick is computed/loaded in an earlier cell).
rs_lin_drag_morg = master_cv(X_drag_morg_sq,Y,None,n_splits=25,model='ridge',alpha=10.0,feature_list=lin_ranked_rick[:,:,:1000])


0
Desc. 0: 0.646
1
Desc. 1: 0.566
2
Desc. 2: 0.474
3
Desc. 3: 0.610
4
Desc. 4: 0.657
5
Desc. 5: 0.512
6
Desc. 6: 0.577
7
Desc. 7: 0.446
8
Desc. 8: 0.363
9
Desc. 9: 0.423
10
Desc. 10: 0.507
11
Desc. 11: 0.379
12
Desc. 12: 0.283
13
Desc. 13: 0.433
14
Desc. 14: 0.427
15
Desc. 15: 0.351
16
Desc. 16: 0.481
17
Desc. 17: 0.424
18
Desc. 18: 0.441
19
Desc. 19: 0.513
20
Desc. 20: 0.532

In [123]:
from scipy.stats import ttest_rel

# Compare ridge performance with Morgan features (wit) against dragon-only
# (wout), per descriptor. Two copies are kept so the same loop draws two
# panel pairs; both currently show the same comparison.
wit = [rs_lin_drag_morg.copy(),rs_lin_drag_morg.copy()]
wout = [rs_lin_drag.copy(),rs_lin_drag.copy()]
diff = [wit[i].mean(axis=1) - wout[i].mean(axis=1) for i in range(2)]
order = [np.argsort(diff[i])[::-1] for i in range(2)] # Descriptors sorted by effect size.
ts,ps = [np.zeros(21),np.zeros(21)],[np.zeros(21),np.zeros(21)]

# Paired t-tests across splits, then Benjamini-Hochberg FDR correction over the
# 21 descriptors. (BUGFIX: the original adjusted each scalar p-value with
# np.argsort(p) of a single number, which is a no-op; BH requires the rank of
# each p-value within the full set.)
for i in range(2):
    for j in range(21):
        ts[i][j],ps[i][j] = ttest_rel(wit[i][j,:],wout[i][j,:])
    ranks = np.argsort(np.argsort(ps[i])) + 1 # Rank (1..21) of each p-value.
    ps[i] = ps[i]*21/ranks # BH adjustment: p * m / rank.
    ts[i] = ts[i][order[i]]
    ps[i] = ps[i][order[i]]
    diff[i] = diff[i][order[i]]
    wit[i] = wit[i][order[i],:]
    wout[i] = wout[i][order[i],:]

fig,ax = plt.subplots(4,1,figsize=(15,16),sharex=True)
yerr = [(wit[i] - wout[i]).std(axis=1)/np.sqrt(n_splits) for i in range(2)]
# Draw the error bar on whichever bar (with/without Morgan) is taller; NaN hides the other.
yerr1 = [[yerr[i][j] if diff[i][j]>=0 else np.nan for j in range(21)] for i in range(2)]
yerr2 = [[yerr[i][j] if diff[i][j]<0 else np.nan for j in range(21)] for i in range(2)]

for i in range(2):
    ax[2*i].bar(np.arange(21),wit[i].mean(axis=1),color='r',yerr=yerr1[i],
          error_kw=dict(ecolor='gray', lw=2, capsize=5, capthick=2),alpha=0.5,label='+morgan')
    ax[2*i].bar(np.arange(21),wout[i].mean(axis=1),color='b',yerr=yerr2[i],
          error_kw=dict(ecolor='gray', lw=2, capsize=5, capthick=2),alpha=0.5,label='--morgan')
    ax[2*i+1].bar(np.arange(21),
              diff[i],
              yerr=yerr[i],color='k',#width=0.4,
              error_kw=dict(ecolor='gray', lw=2, capsize=5, capthick=2),
              alpha=0.5)
    # Significance stars just beyond each difference bar's error bar.
    for j in range(21):
        if ps[i][j] < 0.001:
            star = '***'
        elif ps[i][j] < 0.01:
            star = '**'
        elif ps[i][j] < 0.05:
            star = '*'
        else:
            star = ''
        ax[2*i+1].text(j+0.42,diff[i][j]+(-1 if diff[i][j]<0 else 1)*(yerr[i][j]+0.015),star,
                       size=12,horizontalalignment='center',verticalalignment='center')
    ax[2*i].legend(fontsize=12)
    ax[2*i].set_ylim(0,1)
    ax[2*i+1].set_ylim(diff[i].min()-0.08,diff[i].max()+0.08)
    ax[2*i].set_ylabel('Model/Data\nCorrelation')
    ax[2*i+1].set_ylabel('Correlation increase\nwith Morgan')
    ax[2*i+1].yaxis.labelpad = 25
    ax[2*i+1].set_yticks(np.arange(-0.1,0.3,0.1)) # BUGFIX: was ax[2*1+1], which always restyled panel 3.

for i in range(4):
    ax[i].set_xlim(0,21)
    ax[i].set_xticks(np.arange(21)+0.4)
    ax[i].set_xticklabels([_.split('/')[0].lower() for _ in [descriptors[j] for j in order[int(i/2)]]],rotation=90)
plt.tight_layout()

# Alternative scatter-style presentation, kept disabled as a string literal.
"""
plt.figure(figsize=(10,10))
plt.errorbar(wout.mean(axis=1),
             wit.mean(axis=1),
             xerr=wout.std(axis=1)/np.sqrt(n_splits),
             yerr=wit.std(axis=1)/np.sqrt(n_splits),
             fmt='o',markersize=0)
for j in range(21):
    plt.text(wout.mean(axis=1)[j],wit.mean(axis=1)[j],letters[j],
             fontdict={'size':14,'weight':'bold'},horizontalalignment='center',verticalalignment='center')
plt.plot([0,1],[0,1],'--')
plt.xlabel('without Morgan')
plt.ylabel('with Morgan');    
""";
# NOTE(review): absolute local path -- prefer a repo-relative figures/ directory.
plt.savefig('/Users/rgerkin/Desktop/with_without_morgan.eps',format='eps')



In [127]:
from scipy.stats import ttest_rel

# NOTE(review): this cell is a byte-for-byte duplicate of the previous
# comparison cell (In [123]); consider deleting one copy or factoring the
# shared code into a function. Same fixes applied here.
wit = [rs_lin_drag_morg.copy(),rs_lin_drag_morg.copy()]
wout = [rs_lin_drag.copy(),rs_lin_drag.copy()]
diff = [wit[i].mean(axis=1) - wout[i].mean(axis=1) for i in range(2)]
order = [np.argsort(diff[i])[::-1] for i in range(2)] # Descriptors sorted by effect size.
ts,ps = [np.zeros(21),np.zeros(21)],[np.zeros(21),np.zeros(21)]

# Paired t-tests across splits, then Benjamini-Hochberg FDR correction over the
# 21 descriptors. (BUGFIX: the original per-scalar np.argsort "correction" was
# a no-op; BH needs each p-value's rank within the full set.)
for i in range(2):
    for j in range(21):
        ts[i][j],ps[i][j] = ttest_rel(wit[i][j,:],wout[i][j,:])
    ranks = np.argsort(np.argsort(ps[i])) + 1 # Rank (1..21) of each p-value.
    ps[i] = ps[i]*21/ranks # BH adjustment: p * m / rank.
    ts[i] = ts[i][order[i]]
    ps[i] = ps[i][order[i]]
    diff[i] = diff[i][order[i]]
    wit[i] = wit[i][order[i],:]
    wout[i] = wout[i][order[i],:]

fig,ax = plt.subplots(4,1,figsize=(15,16),sharex=True)
yerr = [(wit[i] - wout[i]).std(axis=1)/np.sqrt(n_splits) for i in range(2)]
# Draw the error bar on whichever bar (with/without Morgan) is taller; NaN hides the other.
yerr1 = [[yerr[i][j] if diff[i][j]>=0 else np.nan for j in range(21)] for i in range(2)]
yerr2 = [[yerr[i][j] if diff[i][j]<0 else np.nan for j in range(21)] for i in range(2)]

for i in range(2):
    ax[2*i].bar(np.arange(21),wit[i].mean(axis=1),color='r',yerr=yerr1[i],
          error_kw=dict(ecolor='gray', lw=2, capsize=5, capthick=2),alpha=0.5,label='+morgan')
    ax[2*i].bar(np.arange(21),wout[i].mean(axis=1),color='b',yerr=yerr2[i],
          error_kw=dict(ecolor='gray', lw=2, capsize=5, capthick=2),alpha=0.5,label='--morgan')
    ax[2*i+1].bar(np.arange(21),
              diff[i],
              yerr=yerr[i],color='k',#width=0.4,
              error_kw=dict(ecolor='gray', lw=2, capsize=5, capthick=2),
              alpha=0.5)
    # Significance stars just beyond each difference bar's error bar.
    for j in range(21):
        if ps[i][j] < 0.001:
            star = '***'
        elif ps[i][j] < 0.01:
            star = '**'
        elif ps[i][j] < 0.05:
            star = '*'
        else:
            star = ''
        ax[2*i+1].text(j+0.42,diff[i][j]+(-1 if diff[i][j]<0 else 1)*(yerr[i][j]+0.015),star,
                       size=12,horizontalalignment='center',verticalalignment='center')
    ax[2*i].legend(fontsize=12)
    ax[2*i].set_ylim(0,1)
    ax[2*i+1].set_ylim(diff[i].min()-0.08,diff[i].max()+0.08)
    ax[2*i].set_ylabel('Model/Data\nCorrelation')
    ax[2*i+1].set_ylabel('Correlation increase\nwith Morgan')
    ax[2*i+1].yaxis.labelpad = 25
    ax[2*i+1].set_yticks(np.arange(-0.1,0.3,0.1)) # BUGFIX: was ax[2*1+1], which always restyled panel 3.

for i in range(4):
    ax[i].set_xlim(0,21)
    ax[i].set_xticks(np.arange(21)+0.4)
    ax[i].set_xticklabels([_.split('/')[0].lower() for _ in [descriptors[j] for j in order[int(i/2)]]],rotation=90)
plt.tight_layout()

# Alternative scatter-style presentation, kept disabled as a string literal.
"""
plt.figure(figsize=(10,10))
plt.errorbar(wout.mean(axis=1),
             wit.mean(axis=1),
             xerr=wout.std(axis=1)/np.sqrt(n_splits),
             yerr=wit.std(axis=1)/np.sqrt(n_splits),
             fmt='o',markersize=0)
for j in range(21):
    plt.text(wout.mean(axis=1)[j],wit.mean(axis=1)[j],letters[j],
             fontdict={'size':14,'weight':'bold'},horizontalalignment='center',verticalalignment='center')
plt.plot([0,1],[0,1],'--')
plt.xlabel('without Morgan')
plt.ylabel('with Morgan');    
""";
# NOTE(review): absolute local path -- prefer a repo-relative figures/ directory.
plt.savefig('/Users/rgerkin/Desktop/with_without_morgan.eps',format='eps')



In [ ]: