No retention time correction; peaks simply grouped at bw=2 after centWave peak-picking


In [1]:
import time

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import cross_val_score
#from sklearn.model_selection import StratifiedShuffleSplit
#from sklearn.model_selection import cross_val_score

from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.utils import shuffle

from scipy import interp

%matplotlib inline


/home/irockafe/miniconda2/envs/isaac_revo_healthcare/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
def remove_zero_columns(X, threshold=1e-20):
    # convert zeros to NaN, drop all-NaN columns, then replace leftover NaNs with zeros
    X_non_zero_column = X.replace(0, np.nan).dropna(how='all', axis=1).replace(np.nan, 0)
    return X_non_zero_column

def zero_fill_half_min(X, threshold=1e-20):
    # Fill zeros with 1/2 the minimum non-zero value of that column.
    # Input is a dataframe; only zero entries are changed.
    
    # Get a vector of 1/2 the minimum non-zero value per column
    half_min = X[X > threshold].min(axis=0)*0.5
    
    # Add the half_min values to a dataframe where everything that isn't zero is NaN,
    # then convert NaNs to 0
    fill_vals = (X[X < threshold] + half_min).fillna(value=0)
    
    # Add the original dataframe to the dataframe of zeros and fill-values
    X_zeros_filled = X + fill_vals
    return X_zeros_filled



toy = pd.DataFrame([[1, 2, 3, 0],
                    [0, 0, 0, 0],
                    [0.5, 1, 0, 0]], dtype=float)

toy_no_zeros = remove_zero_columns(toy)
toy_filled_zeros = zero_fill_half_min(toy_no_zeros)
print toy
print toy_no_zeros
print toy_filled_zeros


     0    1    2    3
0  1.0  2.0  3.0  0.0
1  0.0  0.0  0.0  0.0
2  0.5  1.0  0.0  0.0
     0    1    2
0  1.0  2.0  3.0
1  0.0  0.0  0.0
2  0.5  1.0  0.0
      0    1    2
0  1.00  2.0  3.0
1  0.25  0.5  1.5
2  0.50  1.0  1.5

Import the dataframe and remove any features that are all zero


In [3]:
### Subdivide the data into a feature table
local_path = '/home/irockafe/Dropbox (MIT)/Alm_Lab/'\
'projects'
data_path = local_path + '/revo_healthcare/data/processed/MTBLS72/positive_mode/'\
'mtbls_no_retcor_bw2.csv'
## Import the data and remove extraneous columns
df = pd.read_csv(data_path, index_col=0)
df.shape
df.head()
# Make a new index of mz:rt
mz = df.loc[:,"mz"].astype('str')
rt = df.loc[:,"rt"].astype('str')
idx = mz+':'+rt
df.index = idx

# separate samples from xcms/camera things to make feature table
not_samples = ['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 
               'npeaks', 'positive_mode', 
               ]
samples_list = df.columns.difference(not_samples)
mz_rt_df = df[not_samples]

# convert to samples x features
X_df_raw = df[samples_list].T
# Strip the 'X' prefix R prepends to sample names that start with a digit
new_idx = [i.replace('X', '') for i in X_df_raw.index]
X_df_raw.index = new_idx
# Remove all-zero columns and fill remaining zeros with 1/2 the column minimum
X_df = remove_zero_columns(X_df_raw)
X_df_zero_filled = zero_fill_half_min(X_df)

print "original shape: %s \n# zeros: %f\n" % (X_df_raw.shape, (X_df_raw < 1e-20).sum().sum())
print "zero-columns and NaNs repalced? shape: %s \n# zeros: %f\n" % (X_df.shape, 
                                                         (X_df < 1e-20).sum().sum())
print "zeros filled shape: %s \n#zeros: %f\n" % (X_df_zero_filled.shape, 
                                              (X_df_zero_filled < 1e-20).sum().sum())


# Convert to numpy matrix to play nicely with sklearn
#X = X_df.as_matrix()
#print X.shape
#print X_df.head()


original shape: (254, 1417) 
# zeros: 0.000000

zero-columns and NaNs replaced? shape: (254, 1417) 
# zeros: 83441.000000

zeros filled shape: (254, 1417) 
#zeros: 0.000000

Get mappings between sample names, file names, and sample classes


In [4]:
# get mapping between sample name and sample class
path_sample_class_map = local_path + '/revo_healthcare/data/raw/MTBLS72/s_Plasma_AD_Lipidomics.txt'
class_column = 'Factor Value[Cognitive Status]'

class_df = pd.read_csv(path_sample_class_map,
                      sep='\t')
# Set index as sample name
class_df.set_index('Sample Name', inplace=True)
class_df = class_df[class_column]
# select only positive values
class_df = class_df[class_df.index.str.contains('POS')]
print class_df.head(10)
print "Class label shape: ", class_df.shape
print "feature table shape: ", X_df.shape

#class_df.rename('class', inplace=True)
print class_df.head()
pd.unique(class_df.values)


Sample Name
20120829_Howard_POS_129    aMCI/AD
20120829_Howard_POS_130    aMCI/AD
20120829_Howard_POS_131    aMCI/AD
20120829_Howard_POS_132    aMCI/AD
20120829_Howard_POS_134    aMCI/AD
20120829_Howard_POS_135    aMCI/AD
20120829_Howard_POS_137    aMCI/AD
20120829_Howard_POS_142    aMCI/AD
20120829_Howard_POS_158    aMCI/AD
20120829_Howard_POS_160    aMCI/AD
Name: Factor Value[Cognitive Status], dtype: object
Class label shape:  (323,)
feature table shape:  (254, 1417)
Sample Name
20120829_Howard_POS_129    aMCI/AD
20120829_Howard_POS_130    aMCI/AD
20120829_Howard_POS_131    aMCI/AD
20120829_Howard_POS_132    aMCI/AD
20120829_Howard_POS_134    aMCI/AD
Name: Factor Value[Cognitive Status], dtype: object
Out[4]:
array(['aMCI/AD', 'Normal Control', 'Not Applicable'], dtype=object)

Control is 0, Alzheimer's is 1

Note that the 'Not Applicable' class is not part of the feature table X, so when X and y are matched to the same samples, all the 'Not Applicable' samples are dropped.


In [5]:
# Match between feature table and metadata and assert that they're in the same order.
# then define the numpy-arrays for X and y
class_df = class_df[X_df.index].sort_index()
X_df = X_df.sort_index()

assert (class_df.index == X_df.index).all()

X = X_df.as_matrix()

# convert classes to numbers
le = preprocessing.LabelEncoder()
le.fit(class_df)
y = le.transform(class_df)
print y


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 1 1 0 1 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 1 1 1 0 0 1 1 1 1 0 1 0 1 0 1 1 0 1 1
 1 0 1 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 1 1 1 0 0 1 0 1
 1 1 0 1 0 1 1 1 0 0 0 1 0 0 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 1 1 0 0
 0 0 0 1 0 1 0 1 0 0 1 0 1 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 1 1
 0 0 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

In [6]:
# TODO: PQN normalization, log-transformation,
# and some feature selection (e.g. intensity above a threshold, or principal components), etc.

def pqn_normalize(X, integral_first=False, plot=False):
    '''
    Probabilistic quotient normalization of a (samples x features) table:
    divide each sample by the median of its feature-wise quotients against
    the per-feature median reference.
    '''
    # normalize by sum of intensities in each sample first. Not necessary
    if integral_first: 
        sample_sums = np.sum(X, axis=1)
        X = (X / sample_sums[:,np.newaxis])
    
    # Get the median value of each feature across all samples
    median_intensities = np.median(X, axis=0)
    
    # Divide each feature by its median value across samples -
    # these are the quotients for each feature
    X_quotients = (X / median_intensities[np.newaxis,:])
    
    if plot: # plot the distribution of quotients for each sample
        # (loop starts at 1, so the first sample's quotients are skipped)
        for i in range(1, len(X_quotients[:,1])):
            print 'allquotients reshaped!\n\n', 
            #all_quotients = X_quotients.reshape(np.prod(X_quotients.shape))
            all_quotients = X_quotients[i,:]
            print all_quotients.shape
            sns.violinplot(all_quotients)
            plt.title("median val: %f\nMax val=%f" % (np.median(all_quotients), np.max(all_quotients)))
            plt.xlim([-0.5, 5])
            plt.show()

    # Define a quotient for each sample as the median of the feature-specific quotients
    # in that sample
    sample_quotients = np.median(X_quotients, axis=1)
    
    # Quotient-normalize each sample
    X_pqn = X / sample_quotients[:,np.newaxis]
    return X_pqn

# Make a toy feature table: two pairs of samples, each pair at 1x and 2x dilution
X_toy = np.array([[1, 1, 1],
                  [2, 2, 2],
                  [3, 6, 9],
                  [6, 12, 18]], dtype=float)
print X_toy
print X_toy.reshape(1, np.prod(X_toy.shape))
X_toy_pqn_int = pqn_normalize(X_toy, integral_first=True, plot=True)
print X_toy_pqn_int

print '\n\n\n'
X_toy_pqn = pqn_normalize(X_toy)
print X_toy_pqn


[[  1.   1.   1.]
 [  2.   2.   2.]
 [  3.   6.   9.]
 [  6.  12.  18.]]
[[  1.   1.   1.   2.   2.   2.   3.   6.   9.   6.  12.  18.]]
allquotients reshaped!

(3,)
allquotients reshaped!

(3,)
allquotients reshaped!

(3,)
[[ 0.33333333  0.33333333  0.33333333]
 [ 0.33333333  0.33333333  0.33333333]
 [ 0.16666667  0.33333333  0.5       ]
 [ 0.16666667  0.33333333  0.5       ]]




[[ 4.  4.  4.]
 [ 4.  4.  4.]
 [ 2.  4.  6.]
 [ 2.  4.  6.]]
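
Sanity check on the last result: without the integral normalization, the per-feature medians of X_toy are [2.5, 4, 5.5]. For the row [6, 12, 18] the feature quotients are [2.4, 3.0, ~3.27]; their median (3.0) is taken as that sample's dilution factor, and dividing the row by it gives [2, 4, 6]. Each 1x/2x pair collapses onto the same profile, which is exactly what PQN is supposed to do.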

PQN-normalize the full feature table


In [7]:
X_pqn = pqn_normalize(X)
print X_pqn


[[  2376.68354621   2632.79600443  13599.57019084 ...,   2316.63382316
    2019.66847599    446.83641138]
 [  2895.79115719   2982.04128079  15946.90497631 ...,   2190.68836264
    2925.0567903     645.62885272]
 [     0.           3992.6417963   16528.77236548 ...,   2220.2094898
    1848.15195494    623.7804676 ]
 ..., 
 [     0.              0.              0.         ...,      0.              0.
       0.        ]
 [  2337.76090654      0.          37674.85217975 ...,      0.
    2609.61532135    654.76842215]
 [     0.              0.          36787.00296048 ...,      0.              0.
       0.        ]]
/home/irockafe/miniconda2/envs/isaac_revo_healthcare/lib/python2.7/site-packages/ipykernel/__main__.py:38: RuntimeWarning: divide by zero encountered in divide
/home/irockafe/miniconda2/envs/isaac_revo_healthcare/lib/python2.7/site-packages/ipykernel/__main__.py:38: RuntimeWarning: invalid value encountered in divide
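
The RuntimeWarnings are expected here: X still contains zeros (it was built from X_df, not the zero-filled table), so features with a zero median give inf/NaN quotients and some sample quotients come out zero. A possible guard, not applied above, would be to restrict the quotient calculation to features with a non-zero median (a sketch; nanmedian assumes numpy >= 1.9):

    # hypothetical guard: ignore zero-median features and zero quotients
    nonzero = np.median(X, axis=0) > 0
    X_quotients = X[:, nonzero] / np.median(X[:, nonzero], axis=0)[np.newaxis, :]
    sample_quotients = np.nanmedian(np.where(X_quotients > 0, X_quotients, np.nan), axis=1)
    X_pqn_guarded = X / sample_quotients[:, np.newaxis]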

Random Forest & AdaBoost with PQN-normalized data


In [8]:
def roc_curve_cv(X, y, clf, cross_val,
                path='/home/irockafe/Desktop/roc.pdf',
                save=False, plot=True):
    '''
    Fit clf on each train/test split from cross_val, build interpolated ROC curves,
    and (optionally) plot the mean curve. Returns (tpr_list, auc_list, mean_fpr).
    '''
    t1 = time.time()
    # collect vals for the ROC curves
    tpr_list = []
    mean_fpr = np.linspace(0,1,100)
    auc_list = []
    
    # Get the false-positive and true-positive rate
    for i, (train, test) in enumerate(cross_val):
        clf.fit(X[train], y[train])
        y_pred = clf.predict_proba(X[test])[:,1]
        
        # get fpr, tpr
        fpr, tpr, thresholds = roc_curve(y[test], y_pred)
        roc_auc = auc(fpr, tpr)
        #print 'AUC', roc_auc
        #sns.plt.plot(fpr, tpr, lw=10, alpha=0.6, label='ROC - AUC = %0.2f' % roc_auc,)
        #sns.plt.show()
        tpr_list.append(interp(mean_fpr, fpr, tpr))
        tpr_list[-1][0] = 0.0
        auc_list.append(roc_auc)
        
        if (i % 10 == 0):
            print '{perc}% done! {time}s elapsed'.format(perc=100*float(i)/cross_val.n_iter, time=(time.time() - t1))
        
    # get mean tpr and fpr
    mean_tpr = np.mean(tpr_list, axis=0)
    # make sure it ends up at 1.0
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(auc_list)
    
    if plot:
        # plot mean auc
        plt.plot(mean_fpr, mean_tpr, label='Mean ROC - AUC = %0.2f $\pm$ %0.2f' % (mean_auc, 
                                                                                       std_auc),
                    lw=5, color='b')

        # plot luck-line
        plt.plot([0,1], [0,1], linestyle = '--', lw=2, color='r',
                    label='Luck', alpha=0.5) 

        # plot 1-std
        std_tpr = np.std(tpr_list, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
        plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.2,
                            label=r'$\pm$ 1 stdev')

        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC curve, {iters} iterations of {cv} cross validation'.format(
            iters=cross_val.n_iter, cv='{train}:{test}'.format(test=cross_val.test_size, train=(1-cross_val.test_size)))
                 )
        plt.legend(loc="lower right")

        if save:
            plt.savefig(path,  format='pdf')


        plt.show()
    return tpr_list, auc_list, mean_fpr

In [22]:
rf_estimators = 1000
n_iter = 3
test_size = 0.3
random_state = 1
cross_val_rf = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=random_state)
clf_rf = RandomForestClassifier(n_estimators=rf_estimators, random_state=random_state)

print cross_val_rf.n_iter
print cross_val_rf.test_size

tpr_vals, auc_vals, mean_fpr = roc_curve_cv(X, y, clf_rf, cross_val_rf,
                                           save=False)


3
0.3
0.0% done! 3.16399288177s elapsed

In [23]:
# For adaboosted
n_iter = 3
test_size = 0.3
random_state = 1
adaboost_estimators = 200



cross_val_adaboost = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=random_state)
clf = AdaBoostClassifier(n_estimators=adaboost_estimators, random_state=random_state)
adaboost_tpr, adaboost_auc, adaboost_fpr = roc_curve_cv(X, y, clf, cross_val_adaboost,
                                                       )


0.0% done! 4.0625231266s elapsed

Great, you can classify things. But build null models as a sanity check to make sure you aren't just classifying garbage.


In [10]:
# Make a null model AUC curve

def make_null_model(X, y, clf, cross_val, random_state=1, num_shuffles=5, plot=True):
    '''
    Run the true model, then sanity-check it by shuffling the class labels
    and building cross-validated ROC curves from the shuffled labels.
    TODO: Compare the true AUCs vs. the shuffled AUCs with a statistical test
    (a t-test assumes normality of the AUC distribution).
    '''
    null_aucs = []
    print y.shape
    print X.shape
    tpr_true, auc_true, fpr_true = roc_curve_cv(X, y, clf, cross_val, save=True)
    # shuffle y lots of times
    for i in range(0, num_shuffles):
        #Iterate through the shuffled y vals and repeat with appropriate params
        # Retain the auc vals for final plotting of distribution
        y_shuffle = shuffle(y)
        # point the (old-API) cross-validation object at the shuffled labels
        cross_val.y = y_shuffle
        cross_val.y_indices = y_shuffle
        print 'Number of labels unchanged after shuffle: %s' % (y == y_shuffle).sum()
        # Get auc values for number of iterations
        tpr, auc, fpr = roc_curve_cv(X, y_shuffle, clf, cross_val, plot=True)
        
        null_aucs.append(auc)
    
    
    # plot the outcome as violin plots of the AUC values
    if plot:
        flattened_aucs = [j for i in null_aucs for j in i]
        my_dict = {'true_auc': auc_true, 'null_auc': flattened_aucs}
        df_poop =  pd.DataFrame.from_dict(my_dict, orient='index').T
        df_tidy = pd.melt(df_poop, value_vars=['true_auc', 'null_auc'],
                         value_name='auc', var_name='AUC_type')
        #print flattened_aucs
        sns.violinplot(x='AUC_type', y='auc',
            inner='points', data=df_tidy)
        # Plot the distribution of AUC values
        plt.title("Distribution of AUCs")
        plt.xlabel('AUC type')
        plt.ylabel('AUC')
        plt.show()
    # TODO: quick statistical test of how likely an AUC this good is under the null
    return auc_true, null_aucs

In [11]:
# Make a null model AUC curve & compare it to null-model

# Random forest magic!
rf_estimators = 1000
n_iter = 50
test_size = 0.3
random_state = 1
cross_val_rf = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=random_state)
clf_rf = RandomForestClassifier(n_estimators=rf_estimators, random_state=random_state)

true_auc, all_aucs = make_null_model(X, y, clf_rf, cross_val_rf, num_shuffles=5)


(254,)
(254, 1417)
0.0% done! 4.45924210548s elapsed
20.0% done! 78.9431581497s elapsed
40.0% done! 143.782060146s elapsed
60.0% done! 188.889692068s elapsed
80.0% done! 230.896824121s elapsed
Number of labels unchanged after shuffle: 146
0.0% done! 4.78017687798s elapsed
20.0% done! 56.8655638695s elapsed
40.0% done! 105.429105043s elapsed
60.0% done! 150.889708042s elapsed
80.0% done! 197.193608046s elapsed
Number of labels unchanged after shuffle: 150
0.0% done! 4.63375210762s elapsed
20.0% done! 52.5110690594s elapsed
40.0% done! 102.090430021s elapsed
60.0% done! 151.704711914s elapsed
80.0% done! 202.616847992s elapsed
Number of labels unchanged after shuffle: 140
0.0% done! 4.52952599525s elapsed
20.0% done! 50.0040450096s elapsed
40.0% done! 93.9484059811s elapsed
60.0% done! 140.00563693s elapsed
80.0% done! 188.164286137s elapsed
Number of labels unchanged after shuffle: 142
0.0% done! 4.62021303177s elapsed
20.0% done! 55.9427940845s elapsed
40.0% done! 109.53305006s elapsed
60.0% done! 158.867068052s elapsed
80.0% done! 204.473369122s elapsed
Number of labels unchanged after shuffle: 154
0.0% done! 4.78358602524s elapsed
20.0% done! 52.0092451572s elapsed
40.0% done! 96.5998790264s elapsed
60.0% done! 141.035567045s elapsed
80.0% done! 189.470935106s elapsed

In [12]:
# make dataframe from the true and null AUCs
flattened_aucs = [j for i in all_aucs for j in i]
my_dict = {'true_auc': true_auc, 'null_auc': flattened_aucs}
df_poop =  pd.DataFrame.from_dict(my_dict, orient='index').T
df_tidy = pd.melt(df_poop, value_vars=['true_auc', 'null_auc'],
                 value_name='auc', var_name='AUC_type')
print df_tidy.head()
#print flattened_aucs
sns.violinplot(x='AUC_type', y='auc',
    inner='points', data=df_tidy, bw=0.7)
plt.title("Classification is not possible when data is shuffled")
#sns.plt.ylabel('count')
plt.xlabel('True model vs. Null Model')
plt.ylabel('AUC')
#sns.plt.plot(auc_true, 0, color='red', markersize=10)
plt.savefig('/home/irockafe/Desktop/auc distribution', format='pdf')
plt.show()


   AUC_type       auc
0  true_auc  0.816116
1  true_auc  0.822727
2  true_auc  0.703306
3  true_auc  0.847107
4  true_auc  0.901653
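
The TODO in make_null_model about formally comparing the true and shuffled AUC distributions could be done along these lines (a sketch, not run here; a Mann-Whitney U test is swapped in for the t-test so normality of the AUCs isn't assumed, and the alternative keyword needs scipy >= 0.17):

    from scipy import stats
    # one-sided test: are the true-model AUCs systematically larger than the shuffled-label AUCs?
    u_stat, p_val = stats.mannwhitneyu(true_auc, flattened_aucs, alternative='greater')
    print 'Mann-Whitney U = %.1f, p = %.2e' % (u_stat, p_val)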

Let's check out some PCA plots


In [63]:
from sklearn.decomposition import PCA

# Check PCA of things
def PCA_plot(X, y, n_components, plot_color, class_nums, class_names, title='PCA'):
    pca = PCA(n_components=n_components)
    X_pca = pca.fit(X).transform(X)

    print zip(plot_color, class_nums, class_names)
    for color, i, target_name in zip(plot_color, class_nums, class_names):
        
        # plot one class at a time, first plot all classes y == 0
        #print color
        #print y == i
        xvals = X_pca[y == i, 0]
        print xvals.shape
        yvals = X_pca[y == i, 1]
        plt.scatter(xvals, yvals, color=color, alpha=0.8, label=target_name)

    plt.legend(bbox_to_anchor=(1.01,1), loc='upper left', shadow=False)
    plt.title(title)
    plt.show()


PCA_plot(X, y, 2, ['red', 'blue'], [0,1], ['Control', 'Alzheimers'],
         title="PCA of MTBLS72 plasma lipidomics (control vs. Alzheimer's)")


[('red', 0, 'Control'), ('blue', 1, 'Alzheimers')]
(182,)
(72,)
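
The log-transformation from the earlier TODO was never applied, and PCA on raw intensities is dominated by the most intense features. A rough sketch of that preprocessing before re-plotting (the choice of log2 and autoscaling is an assumption, not part of the original analysis; zeros are re-filled on the sorted table so the log is defined and the rows still line up with y):

    # zero-fill the sorted feature table, PQN-normalize, log-transform, then autoscale each feature
    X_prepped = np.log2(pqn_normalize(zero_fill_half_min(X_df).as_matrix()))
    X_scaled = preprocessing.StandardScaler().fit_transform(X_prepped)
    PCA_plot(X_scaled, y, 2, ['red', 'blue'], [0, 1], ['Control', 'Alzheimers'],
             title='PCA, log2-transformed and autoscaled')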