In [1]:
# Preliminaries to work with the data.   
# Assumes this notebook is in olfaction-prediction/opc_python/gerkin
%matplotlib inline
import os
import sys
curr_path = os.getcwd()
gerkin_path = os.path.split(curr_path)[0]
olfaction_prediction_path = os.path.split(gerkin_path)[0]
sys.path.append(olfaction_prediction_path)

import opc_python
from opc_python.utils import loading, scoring
from opc_python.gerkin import dream
import numpy as np
import matplotlib.pyplot as plt
import pandas

In [4]:
descriptors = loading.get_descriptors()
all_CIDs = sorted(loading.get_CIDs('training')+loading.get_CIDs('leaderboard')+loading.get_CIDs('testset'))
mdx_full = dream.get_molecular_data(['dragon','episuite','morgan','nspdk','gramian'],all_CIDs)
mdx_drag = dream.get_molecular_data(['dragon'],all_CIDs)
mdx_drag_morg = dream.get_molecular_data(['dragon','morgan'],all_CIDs)


Episuite has 62 features for 476 molecules.
Morgan has 2437 features for 476 molecules.
NSPDK has 6163 features for 476 molecules.
NSPDK Gramian has 2437 features for 476 molecules.
There are now 15969 total features.
There are now 4870 total features.
Morgan has 2437 features for 476 molecules.
There are now 7307 total features.

In [5]:
# Create the feature matrices from the feature dicts.  
from sklearn.preprocessing import Imputer,MinMaxScaler
#X_all,good1,good2,means,stds,imputer = dream.make_X(mdx_full,['training','leaderboard'])
X_drag,good1,good2,means,stds,imputer = dream.make_X(mdx_drag,['training','leaderboard'])
X_drag_morg,good1,good2,means,stds,imputer = dream.make_X(mdx_drag_morg,['training','leaderboard'])
def quad_prep(mdx):
    X_temp,_,_,_,_,_ = dream.make_X(mdx,['training','leaderboard'],raw=True)
    X_temp[np.isnan(X_temp)] = 0
    X_scaled = MinMaxScaler().fit_transform(X_temp[:,:-2])
    X_scaled_sq = np.hstack((X_scaled,X_scaled**2,X_temp[:,-2:]))
    return X_scaled_sq
X_drag_sq = quad_prep(mdx_drag)
X_drag_morg_sq = quad_prep(mdx_drag_morg)
Y_all,imputer = dream.make_Y_obs(['training','leaderboard'],target_dilution=None,imputer='mask')


The X matrix now has shape (814x3063) molecules by non-NaN good molecular descriptors
The X matrix now has shape (814x5497) molecules by non-NaN good molecular descriptors
The X matrix now has shape (814x4871) molecules by non-NaN good molecular descriptors
The X matrix now has shape (814x7308) molecules by non-NaN good molecular descriptors
The Y['mean_std'] matrix now has shape (814x42) molecules by 2 x perceptual descriptors
The Y['subject'] dict now has 49 matrices of shape (814x21) molecules by perceptual descriptors, one for each subject
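
To make the quadratic preparation above easier to follow, here is a minimal sketch of the same transformation on a made-up toy array (three rows, two descriptors, two dilution columns). It only illustrates the steps inside quad_prep; it is not part of the pipeline and uses no DREAM data.

In [ ]:
# Toy illustration of quad_prep: rescale descriptors to [0,1], append their squares,
# and keep the last two (dilution) columns untouched. The array below is made up.
import numpy as np
from sklearn.preprocessing import MinMaxScaler

toy = np.array([[1.0, 10.0, 0, -3],
                [2.0, 30.0, 1, -1],
                [4.0, 20.0, 0, -3]]) # Last two columns stand in for the dilution features.
toy_scaled = MinMaxScaler().fit_transform(toy[:,:-2]) # Descriptors only, mapped to [0,1].
toy_sq = np.hstack((toy_scaled,toy_scaled**2,toy[:,-2:])) # Linear terms, squared terms, dilution columns.
print(toy_sq.shape) # (3, 2*2 + 2) = (3, 6)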

In [53]:
import pandas as pd
linear_mean_noleak = pd.read_csv('../../data/2e_mean_noleak.csv')
linear_sem_noleak = pd.read_csv('../../data/2e_se_noleak.csv')

In [45]:
lin_importances = np.zeros((250,21,14613)) # 250 splits, 21 descriptors, 14613 features without the leak (7306*2 + negLogD)
lin_ranked = np.zeros((250,21,14613)).astype(int) # Matrix to store the feature rankings for each split and descriptor.  
lin_features = pd.read_csv('../../data/linear_scores_and_features/features_dragon_morgan.csv',nrows=0) # Header only (feature names).
for split in range(250):
    if split % 25 == 0:
        print(split)
    for desc in range(21):
        scores = pd.read_csv('../../data/linear_scores_and_features/LB_scores_morgan%d/scores_%d.csv' % (split,desc),index_col=0)
        if 'Intensity' in scores.index:
            scores.drop(['0','Intensity'],inplace=True) # Get rid of that weird "0" feature and the leak feature.  
        if 'neglog10d' not in scores.index:
            scores.loc[14612] = 1 # Add in the dilution if it isn't there.  Assume it is as important as possible.  
        scores['int'] = np.arange(14613).astype(int) # Temp index to use for ranking.  
        lin_ranked[split,desc,:] = scores.sort_values(by='0',ascending=False)['int'] # Sort by score (descending) and store the ranking.


0
25
50
75
100
125
150
175
200
225
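
The ranking step inside the loop can also be checked in isolation. The sketch below uses a made-up three-feature scores table (the real files have 14613 rows); the column name '0' mirrors the score CSVs read above.

In [ ]:
# Toy illustration of how a column of scores becomes a feature ranking (made-up data).
import numpy as np
import pandas as pd

toy_scores = pd.DataFrame({'0': [0.2, 0.9, 0.5]}, index=['feat_a','feat_b','feat_c'])
toy_scores['int'] = np.arange(len(toy_scores)).astype(int) # Positional index used for ranking.
toy_ranking = toy_scores.sort_values(by='0',ascending=False)['int'].values
print(toy_ranking) # [1 2 0]: feat_b has the highest score, then feat_c, then feat_a.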

In [46]:
lin_ranked[0,0,:] # The feature ranking for the first split, for intensity. 
                  # negLogD appears first, so presumably this is working.


Out[46]:
array([14612,  5847,  3552, ...,  5097,  5098,  7306])

In [47]:
from sklearn.cross_validation import ShuffleSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge

n_features = [1,2,3,4,5,10,33,100,333,1000,3333,10000]
n_splits = 10

# This class produces a new iterator that makes sure that the training and test sets do not contain the same molecule
# at different dilutions, and also that the higher concentration of each test pair is the one tested (or 10^-3 for intensity).  
class DoubleSS:
    def __init__(self, ss, col, concs):
        self.splits = ss
        self.col = col
        self.concs = concs

    def __iter__(self):
        for train, test in self.splits:
            train = np.concatenate((2*train,2*train+1))
            if self.col>0:
                test = 2*test+1 # The second (higher) concentration of the pair
            else:
                test = np.concatenate((2*test,2*test+1))
                test = test[self.concs[test]==-3]
            yield train, test
            
    def __len__(self):
        return len(self.splits)

Y = Y_all['mean_std'] # Use the perceptual means (second 21 columns are std's which are ignored here)
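
A quick way to see what DoubleSS yields is to run it on a handful of molecules. The sketch below assumes the same row layout as above (consecutive low/high dilution rows per molecule) and a made-up concentration vector; ShuffleSplit and DoubleSS are the ones defined in the cell above.

In [ ]:
# Toy check of DoubleSS: 6 molecules, each stored as consecutive (low, high) dilution rows.
# The concentration vector is made up; col=1 (>0) means "test on the higher concentration".
toy_concs = np.array([-3,-1]*6)
toy_ss = ShuffleSplit(6,n_iter=1,test_size=2,random_state=0)
for toy_train,toy_test in DoubleSS(toy_ss,1,toy_concs):
    print(sorted(toy_train),sorted(toy_test)) # Train uses both rows of each training molecule; test uses only the odd (high) rows.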

In [51]:
# Produces correlations for each split for each descriptor, for varying numbers of (ranked) features.
def feature_sweep(X_all,Y,n_estimators,n_splits=n_splits,n_features=n_features,model='rf',alpha=1.0,random_state=0):
    rs = np.zeros((21,len(n_features),n_splits)) # Empty matrix to store correlations.  
    n_obs = int(X_all.shape[0]/2) # Number of molecules.  
    # This will produce the splits in train/test_big that I put on GitHub.
    shuffle_split = ShuffleSplit(n_obs,n_splits,test_size=0.17,random_state=0)
    
    for col in range(21): # For each descriptor.  
        X = X_all[:,:-1] # Remove high/low dilution feature, i.e. remove the leak.
        observed = Y[:,col] # Perceptual data for this descriptor.  
        n_features_ = list(np.array(n_features)+(col==0)) # Use one extra feature when col==0 (Intensity).
        cv = DoubleSS(shuffle_split, col, X_all[:,-2]) # Produce the correct train and test indices.  
        for j,(train,test) in enumerate(cv):
            if j % 50 == 0:
                print(col,j)
            if model == 'rf': # If the model is random forest regression.  
                est = RandomForestRegressor(n_estimators=n_estimators,max_features='auto',
                                            oob_score=False,n_jobs=8,random_state=random_state)
            elif model == 'ridge': # If the model is ridge regression. 
                est = Ridge(alpha=alpha,fit_intercept=True, normalize=False, 
                            copy_X=True, max_iter=None, tol=0.001, solver='auto', random_state=random_state)
            est.fit(X[train,:],observed[train]) # Fit the model on the training data.  
            if model == 'rf':
                importance_ranks = np.argsort(est.feature_importances_)[::-1] # Use feature importances to get ranks.  
            elif model == 'ridge':
                importance_ranks = lin_ranked[j,col,:] # Use the pre-computed ranks.
            for i,max_features in enumerate(n_features_): # For each number of features to keep...
                est.fit(X[train,:][:,importance_ranks[:max_features]],
                        observed[train]) # Fit the model on the training data with max_features features.
                predicted = est.predict(X[test,:][:,importance_ranks[:max_features]]) # Predict the test data.  
                rs[col,i,j] = np.corrcoef(predicted,observed[test])[1,0] # Compute the correlation coefficient.  

        means = rs[col,:,:].mean(axis=1) # Compute the mean correlation across splits.  
        sems = rs[col,:,:].std(axis=1)/np.sqrt(n_splits) # Compute the SEM of the correlation across splits.  
        print(('Desc. %d:'+len(n_features)*' [%.3f],') % \
              tuple([col]+[means[i] for i in range(len(n_features))])) # Print the results.  
    return rs
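
The commented-out 'forest' line in the plotting cell further below expects an array of per-split correlations produced by the random-forest branch of feature_sweep. The call below is only a usage sketch (kept commented out, with deliberately small settings so it would finish quickly); it is not the configuration behind any reported result.

In [ ]:
# Usage sketch of the random-forest branch of feature_sweep: 25 trees, 2 splits.
# Increase n_estimators/n_splits for a real run.
#forest = feature_sweep(X_drag_morg_sq,Y,25,n_splits=2,model='rf',random_state=0)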

In [56]:
# Use the linear and squared terms plus negLogD, with ridge regression and the pre-computed feature rankings.
rs_250 = feature_sweep(X_drag_morg_sq,Y,None,n_splits=250,model='ridge',random_state=0)
np.save('../../data/gabor_rs_250',rs_250)
rs_10 = np.load('../../data/gabor_rs_10.npy')

In [59]:
assert np.array_equal(np.array(n_features),np.array(linear_mean_noleak['number of features']))
fig,axes = plt.subplots(3,7,sharex=False,sharey=True,figsize=(20,10))
for col,ax in enumerate(axes.flat):
    lin_means = linear_mean_noleak[descriptors[col]]
    ax.errorbar(n_features,lin_means,linear_sem_noleak[descriptors[col]],color='purple',label='lin_gabor')
    #ax.errorbar(n_features,rs_10[col].mean(axis=1),rs_10[col].std(axis=1)/np.sqrt(n_splits),color='red',label='lin_10')
    ax.errorbar(n_features,rs_250[col].mean(axis=1),rs_250[col].std(axis=1)/np.sqrt(n_splits),color='blue',label='lin_250')
    #ax.errorbar(n_features,forest[col].mean(axis=1),forest[col].std(axis=1)/np.sqrt(n_splits),color='green',label='forest')
    if col==0:
        ax.legend()
        handles, labels = ax.get_legend_handles_labels()
        lg = ax.legend(handles[1:], labels[1:], loc=4, fontsize=10)
        lg.draw_frame(False)
    ax.set_xlim(0.5,20000)
    ax.set_ylim(0,0.8)
    ax.set_yticks(np.linspace(0,0.6,4))
    ax.set_yticklabels([_ for _ in np.linspace(0,0.6,4)],size=20)
    ax.set_xticklabels(n_features,size=20)
    ax.set_xscale('log')
    descriptor = descriptors[col].split('/')[1 if col==1 else 0]
    descriptor = descriptor[0]+descriptor[1:].lower()
    ax.set_title(descriptor, size=25)
plt.tight_layout()
fig.text(0.5, -0.025, 'Number of features', ha='center', size=25)
fig.text(-0.02, 0.5, 'Correlation', va='center', rotation='vertical', size=25);



In [ ]: