4.5ppm setting

No warning that there are too few retention correction groups. Not too many peak-data insertion problems


In [1]:
import time

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import cross_val_score
#from sklearn.model_selection import StratifiedShuffleSplit
#from sklearn.model_selection import cross_val_score

from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.utils import shuffle

from scipy import interp

%matplotlib inline

In [2]:
def remove_zero_columns(X, threshold=1e-20):
    # convert zeros to nan, drop all nan columns, the replace leftover nan with zeros
    X_non_zero_colum = X.replace(0, np.nan).dropna(how='all', axis=1).replace(np.nan, 0)
    #.dropna(how='all', axis=0).replace(np.nan,0)
    return X_non_zero_colum

def zero_fill_half_min(X, threshold=1e-20):
    # Fill zeros with 1/2 the minimum value of that column
    # input dataframe. Add only to zero values
    
    # Get a vector of 1/2 minimum values
    half_min = X[X > threshold].min(axis=0)*0.5
    
    # Add the half_min values to a dataframe where everything that isn't zero is NaN.
    # then convert NaN's to 0
    fill_vals = (X[X < threshold] + half_min).fillna(value=0)
    
    # Add the original dataframe to the dataframe of zeros and fill-values
    X_zeros_filled = X + fill_vals
    return X_zeros_filled



toy = pd.DataFrame([[1,2,3,0],
               [0,0,0,0],
               [0.5,1,0,0]], dtype=float)

toy_no_zeros = remove_zero_columns(toy)
toy_filled_zeros = zero_fill_half_min(toy_no_zeros)
print toy
print toy_no_zeros
print toy_filled_zeros


     0  1  2  3
0  1.0  2  3  0
1  0.0  0  0  0
2  0.5  1  0  0
     0  1  2
0  1.0  2  3
1  0.0  0  0
2  0.5  1  0
      0    1    2
0  1.00  2.0  3.0
1  0.25  0.5  1.5
2  0.50  1.0  1.5

In [ ]:

Import the dataframe and remove any features that are all zero


In [3]:
### Subdivide the data into a feature table
data_path = '/home/irockafe/Dropbox (MIT)/Alm_Lab/projects/revo_healthcare/data/processed/MTBLS315/'\
'uhplc_pos/xcms_result_4.5.csv'
## Import the data and remove extraneous columns
df = pd.read_csv(data_path, index_col=0)
df.shape
df.head()
# Make a new index of mz:rt
mz = df.loc[:,"mz"].astype('str')
rt = df.loc[:,"rt"].astype('str')
idx = mz+':'+rt
df.index = idx
df
# separate samples from xcms/camera things to make feature table
not_samples = ['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 
               'npeaks', 'uhplc_pos', 
               ]
samples_list = df.columns.difference(not_samples)
mz_rt_df = df[not_samples]

# convert to samples x features
X_df_raw = df[samples_list].T
# Remove zero-full columns and fill zeroes with 1/2 minimum values
X_df = remove_zero_columns(X_df_raw)
X_df_zero_filled = zero_fill_half_min(X_df)

print "original shape: %s \n# zeros: %f\n" % (X_df_raw.shape, (X_df_raw < 1e-20).sum().sum())
print "zero-columns repalced? shape: %s \n# zeros: %f\n" % (X_df.shape, 
                                                         (X_df < 1e-20).sum().sum())
print "zeros filled shape: %s \n#zeros: %f\n" % (X_df_zero_filled.shape, 
                                              (X_df_zero_filled < 1e-20).sum().sum())


# Convert to numpy matrix to play nicely with sklearn
X = X_df.as_matrix()
print X.shape


original shape: (61, 6984) 
# zeros: 3816.000000

zero-columns repalced? shape: (61, 6984) 
# zeros: 3816.000000

zeros filled shape: (61, 6984) 
#zeros: 0.000000

(61, 6984)

Get mappings between sample names, file names, and sample classes


In [4]:
# Get mapping between sample name and assay names
path_sample_name_map = '/home/irockafe/Dropbox (MIT)/Alm_Lab/projects/revo_healthcare/data/raw/'\
                'MTBLS315/metadata/a_UPLC_POS_nmfi_and_bsi_diagnosis.txt'
# Index is the sample name
sample_df = pd.read_csv(path_sample_name_map, 
                        sep='\t', index_col=0)
sample_df = sample_df['MS Assay Name']
sample_df.shape
print sample_df.head(10)

# get mapping between sample name and sample class
path_sample_class_map = '/home/irockafe/Dropbox (MIT)/Alm_Lab/projects/revo_healthcare/data/raw/'\
                        'MTBLS315/metadata/s_NMFI and BSI diagnosis.txt'
class_df = pd.read_csv(path_sample_class_map,
                      sep='\t')
# Set index as sample name
class_df.set_index('Sample Name', inplace=True)
class_df = class_df['Factor Value[patient group]']
print class_df.head(10)

# convert all non-malarial classes into a single classes 
# (collapse non-malarial febril illness and bacteremia together)
class_map_df = pd.concat([sample_df, class_df], axis=1)
class_map_df.rename(columns={'Factor Value[patient group]': 'class'}, inplace=True)
class_map_df

binary_class_map = class_map_df.replace(to_replace=['non-malarial febrile illness', 'bacterial bloodstream infection' ], 
                                        value='non-malarial fever')

binary_class_map


Sample Name
MCMA429    1001_P
MCMA430    1002_P
MCMA431    1003_P
MCMA433    1004_P
MCMA434    1005_P
MCMA435    1006_P
MCMA436    1007_P
MCMA442    1009_P
MCMA443    1010_P
MCMA444    1011_P
Name: MS Assay Name, dtype: object
Sample Name
MCMA429                            malaria
MCMA430                            malaria
MCMA431                            malaria
MCMA433                            malaria
MCMA434       non-malarial febrile illness
MCMA435                            malaria
MCMA436    bacterial bloodstream infection
MCMA442                            malaria
MCMA443       non-malarial febrile illness
MCMA444    bacterial bloodstream infection
Name: Factor Value[patient group], dtype: object
Out[4]:
MS Assay Name class
Sample Name
MCMA429 1001_P malaria
MCMA430 1002_P malaria
MCMA431 1003_P malaria
MCMA433 1004_P malaria
MCMA434 1005_P non-malarial fever
MCMA435 1006_P malaria
MCMA436 1007_P non-malarial fever
MCMA442 1009_P malaria
MCMA443 1010_P non-malarial fever
MCMA444 1011_P non-malarial fever
MCMA446 1012_P non-malarial fever
MCMA447 1013_P malaria
MCMA448 1014_P malaria
MCMA450 1016_P malaria
MCMA451 1017_P malaria
MCMA452 1018_P malaria
MCMA455 1019_P non-malarial fever
MCMA458 1020_P non-malarial fever
MCMA459 1021_P non-malarial fever
MCMA462 1023_P non-malarial fever
MCMA463 1024_P non-malarial fever
MCMA465 1026_P non-malarial fever
MCMA466 1027_P malaria
MCMA472 1029_P malaria
MCMA474 1030_P malaria
MCMA477 1031_P malaria
MCMA478 1032_P non-malarial fever
MCMA480 1033_P non-malarial fever
MCMA484 1034_P non-malarial fever
MCMA489 1035_P malaria
... ... ...
MCMA493 1037_P malaria
MCMA495 1038_P malaria
MCMA496 1039_P malaria
MCMA497 1040_P malaria
MCMA501 1041_P malaria
MCMA502 1042_P malaria
MCMA503 1043_P non-malarial fever
MCMA513 1044_P malaria
MCMA515 1045_P non-malarial fever
MCMA516 1046_P malaria
MCMA523 1048_P malaria
MCMA526 1049_P non-malarial fever
MCMA528 1050_P non-malarial fever
MCMA533 1051_P malaria
MCMA534 1052_P non-malarial fever
MCMA535 1053_P non-malarial fever
MCMA537 1054_P non-malarial fever
MCMA538 1055_P non-malarial fever
MCMA550 1056_P malaria
MCMA557 1057_P non-malarial fever
MCMA560 1058_P malaria
MCMA562 1059_P malaria
MCMA564 1060_P non-malarial fever
MCMA565 1061_P non-malarial fever
MCMA566 1062_P non-malarial fever
MURB082 1064_P malaria
MURB085 1065_P malaria
MURB088 1066_P malaria
MURB092 1067_P malaria
MURB095 1068_P malaria

61 rows × 2 columns


In [5]:
# convert classes to numbers
le = preprocessing.LabelEncoder()
le.fit(binary_class_map['class'])
y = le.transform(binary_class_map['class'])

Plot the distribution of classification accuracy across multiple cross-validation splits - Kinda Dumb

Turns out doing this is kind of dumb, because you're not taking into account the prediction score your classifier assigned. Use AUC's instead. You want to give your classifier a lower score if it is really confident and wrong, than vice-versa


In [48]:
def rf_violinplot(X, y, n_iter=25, test_size=0.3, random_state=1,
                 n_estimators=1000):
    cross_val_skf = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, 
                                           random_state=random_state)
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

    scores = cross_val_score(clf, X, y, cv=cross_val_skf)

    sns.violinplot(scores,inner='stick')

rf_violinplot(X,y)


# TODO - Switch to using caret for this bs..?



In [49]:
# Do multi-fold cross validation for adaboost classifier
def adaboost_violinplot(X, y, n_iter=25, test_size=0.3, random_state=1,
                       n_estimators=200):
    cross_val_skf = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=random_state)
    clf = AdaBoostClassifier(n_estimators=n_estimators, random_state=random_state)

    scores = cross_val_score(clf, X, y, cv=cross_val_skf)

    sns.violinplot(scores,inner='stick')

adaboost_violinplot(X,y)



In [7]:
# TODO PQN normalization, and log-transformation, 
# and some feature selection (above certain threshold of intensity, use principal components), et

def pqn_normalize(X, integral_first=False, plot=False):
    '''
    Take a feature table and run PQN normalization on it
    '''
    # normalize by sum of intensities in each sample first. Not necessary
    if integral_first: 
        sample_sums = np.sum(X, axis=1)
        X = (X / sample_sums[:,np.newaxis])
    
    # Get the median value of each feature across all samples
    mean_intensities = np.median(X, axis=0)
    
    # Divde each feature by the median value of each feature - 
    # these are the quotients for each feature
    X_quotients = (X / mean_intensities[np.newaxis,:])
    
    if plot: # plot the distribution of quotients from one sample
        for i in range(1,len(X_quotients[:,1])):
            print 'allquotients reshaped!\n\n', 
            #all_quotients = X_quotients.reshape(np.prod(X_quotients.shape))
            all_quotients = X_quotients[i,:]
            print all_quotients.shape
            x = np.random.normal(loc=0, scale=1, size=len(all_quotients))
            sns.violinplot(all_quotients)
            plt.title("median val: %f\nMax val=%f" % (np.median(all_quotients), np.max(all_quotients)))
            plt.plot( title="median val: ")#%f" % np.median(all_quotients))
            plt.xlim([-0.5, 5])
            plt.show()

    # Define a quotient for each sample as the median of the feature-specific quotients
    # in that sample
    sample_quotients = np.median(X_quotients, axis=1)
    
    # Quotient normalize each samples
    X_pqn = X / sample_quotients[:,np.newaxis]
    return X_pqn

# Make a fake sample, with 2 samples at 1x and 2x dilutions
X_toy = np.array([[1,1,1,],
                  [2,2,2],
                  [3,6,9],
                  [6,12,18]], dtype=float)
print X_toy
print X_toy.reshape(1, np.prod(X_toy.shape))
X_toy_pqn_int = pqn_normalize(X_toy, integral_first=True, plot=True)
print X_toy_pqn_int

print '\n\n\n'
X_toy_pqn = pqn_normalize(X_toy)
print X_toy_pqn


[[  1.   1.   1.]
 [  2.   2.   2.]
 [  3.   6.   9.]
 [  6.  12.  18.]]
[[  1.   1.   1.   2.   2.   2.   3.   6.   9.   6.  12.  18.]]
allquotients reshaped!

(3,)
/usr/local/lib/python2.7/dist-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):
allquotients reshaped!

(3,)
allquotients reshaped!

(3,)
[[ 0.33333333  0.33333333  0.33333333]
 [ 0.33333333  0.33333333  0.33333333]
 [ 0.16666667  0.33333333  0.5       ]
 [ 0.16666667  0.33333333  0.5       ]]




[[ 4.  4.  4.]
 [ 4.  4.  4.]
 [ 2.  4.  6.]
 [ 2.  4.  6.]]

pqn normalize your features


In [8]:
X_pqn = pqn_normalize(X)
print X_pqn


[[ 8118872.40563602   982190.73052249   134642.20612582 ...,
    178714.83481909    66527.85588346        0.        ]
 [  225283.00290818   675027.74568417    91685.23840574 ...,
    155237.74511389    45592.4866388         0.        ]
 [ 5847691.20062829   599940.68571859   102797.37612094 ...,
   2255253.71303898   646599.75750905   154746.42816362]
 ..., 
 [   81463.26730565   351762.08185577    47419.66490118 ...,
   3493131.96541859   972834.49191802   193677.99290531]
 [ 5285256.6154747    545992.38560526    81409.55729641 ...,
   2343422.5163654    699524.11901094   142133.95475263]
 [ 5056165.33272468   548415.14221525   533923.80216375 ...,
   7809162.87664314  2090111.28730953   431778.20362756]]

Random Forest & adaBoost with PQN-normalized data


In [231]:
rf_violinplot(X_pqn, y)



In [232]:
# Do multi-fold cross validation for adaboost classifier
adaboost_violinplot(X_pqn, y)


RF & adaBoost with PQN-normalized, log-transformed data

Turns out a monotonic transformation doesn't really affect any of these things. I guess they're already close to unit varinace...?


In [234]:
X_pqn_nlog = np.log(X_pqn)
rf_violinplot(X_pqn_nlog, y)



In [235]:
adaboost_violinplot(X_pqn_nlog, y)



In [11]:
def roc_curve_cv(X, y, clf, cross_val,
                path='/home/irockafe/Desktop/roc.pdf',
                save=False, plot=True):  
    t1 = time.time()
    # collect vals for the ROC curves
    tpr_list = []
    mean_fpr = np.linspace(0,1,100)
    auc_list = []
    
    # Get the false-positive and true-positive rate
    for i, (train, test) in enumerate(cross_val):
        clf.fit(X[train], y[train])
        y_pred = clf.predict_proba(X[test])[:,1]
        
        # get fpr, tpr
        fpr, tpr, thresholds = roc_curve(y[test], y_pred)
        roc_auc = auc(fpr, tpr)
        #print 'AUC', roc_auc
        #sns.plt.plot(fpr, tpr, lw=10, alpha=0.6, label='ROC - AUC = %0.2f' % roc_auc,)
        #sns.plt.show()
        tpr_list.append(interp(mean_fpr, fpr, tpr))
        tpr_list[-1][0] = 0.0
        auc_list.append(roc_auc)
        
        if (i % 10 == 0):
            print '{perc}% done! {time}s elapsed'.format(perc=100*float(i)/cross_val.n_iter, time=(time.time() - t1))
        
            
        
        
    # get mean tpr and fpr
    mean_tpr = np.mean(tpr_list, axis=0)
    # make sure it ends up at 1.0
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(auc_list)
    
    if plot:
        # plot mean auc
        plt.plot(mean_fpr, mean_tpr, label='Mean ROC - AUC = %0.2f $\pm$ %0.2f' % (mean_auc, 
                                                                                       std_auc),
                    lw=5, color='b')

        # plot luck-line
        plt.plot([0,1], [0,1], linestyle = '--', lw=2, color='r',
                    label='Luck', alpha=0.5) 

        # plot 1-std
        std_tpr = np.std(tpr_list, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
        plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.2,
                            label=r'$\pm$ 1 stdev')

        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC curve, {iters} iterations of {cv} cross validation'.format(
            iters=cross_val.n_iter, cv='{train}:{test}'.format(test=cross_val.test_size, train=(1-cross_val.test_size)))
                 )
        plt.legend(loc="lower right")

        if save:
            plt.savefig(path,  format='pdf')


        plt.show()
    return tpr_list, auc_list, mean_fpr

In [12]:
rf_estimators = 1000
n_iter = 3
test_size = 0.3
random_state = 1
cross_val_rf = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=random_state)
clf_rf = RandomForestClassifier(n_estimators=rf_estimators, random_state=random_state)
rf_graph_path = '''/home/irockafe/Dropbox (MIT)/Alm_Lab/projects/revolutionizing_healthcare/data/MTBLS315/\
isaac_feature_tables/uhplc_pos/rf_roc_{trees}trees_{cv}cviter.pdf'''.format(trees=rf_estimators, cv=n_iter)

print cross_val_rf.n_iter
print cross_val_rf.test_size

tpr_vals, auc_vals, mean_fpr = roc_curve_cv(X_pqn, y, clf_rf, cross_val_rf,
                                           path=rf_graph_path, save=False)


3
0.3
0.0% done! 2.09924697876s elapsed

In [13]:
# For adaboosted
n_iter = 3
test_size = 0.3
random_state = 1
adaboost_estimators = 200
adaboost_path = '''/home/irockafe/Dropbox (MIT)/Alm_Lab/projects/revolutionizing_healthcare/data/MTBLS315/\
isaac_feature_tables/uhplc_pos/adaboost_roc_{trees}trees_{cv}cviter.pdf'''.format(trees=adaboost_estimators, 
                                                                            cv=n_iter)


cross_val_adaboost = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=random_state)
clf = AdaBoostClassifier(n_estimators=adaboost_estimators, random_state=random_state)
adaboost_tpr, adaboost_auc, adaboost_fpr = roc_curve_cv(X_pqn, y, clf, cross_val_adaboost,
                                                       path=adaboost_path)


0.0% done! 5.16996312141s elapsed

In [ ]:

Great, you can classify things. But make null models and do a sanity check to make sure you arent just classifying garbage


In [16]:
# Make a null model AUC curve

def make_null_model(X, y, clf, cross_val, random_state=1, num_shuffles=5, plot=True):
    '''
    Runs the true model, then sanity-checks by:
    
    Shuffles class labels and then builds cross-validated ROC curves from them.
    Compares true AUC vs. shuffled auc by t-test (assumes normality of AUC curve)
    '''
    null_aucs = []
    print y.shape
    print X.shape
    tpr_true, auc_true, fpr_true = roc_curve_cv(X, y, clf, cross_val)
    # shuffle y lots of times
    for i in range(0, num_shuffles):
        #Iterate through the shuffled y vals and repeat with appropriate params
        # Retain the auc vals for final plotting of distribution
        y_shuffle = shuffle(y)
        cross_val.y = y_shuffle
        cross_val.y_indices = y_shuffle
        print 'Number of differences b/t original and shuffle: %s' % (y == cross_val.y).sum()
        # Get auc values for number of iterations
        tpr, auc, fpr = roc_curve_cv(X, y_shuffle, clf, cross_val, plot=True)
        
        null_aucs.append(auc)
    
    
    #plot the outcome
    if plot:
        flattened_aucs = [j for i in null_aucs for j in i]
        my_dict = {'true_auc': auc_true, 'null_auc': flattened_aucs}
        df_poop =  pd.DataFrame.from_dict(my_dict, orient='index').T
        df_tidy = pd.melt(df_poop, value_vars=['true_auc', 'null_auc'],
                         value_name='auc', var_name='AUC_type')
        #print flattened_aucs
        sns.violinplot(x='AUC_type', y='auc',
            inner='points', data=df_tidy)
        # Plot distribution of AUC vals   
        plt.title("Distribution of aucs")
        #sns.plt.ylabel('count')
        plt.xlabel('AUC')
        #sns.plt.plot(auc_true, 0, color='red', markersize=10)
        plt.show()
    # Do a quick t-test to see if odds of randomly getting an AUC that good
    return auc_true, null_aucs

In [17]:
# Make a null model AUC curve & compare it to null-model

# Random forest magic!
rf_estimators = 1000
n_iter = 50
test_size = 0.3
random_state = 1
cross_val_rf = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=random_state)
clf_rf = RandomForestClassifier(n_estimators=rf_estimators, random_state=random_state)

true_auc, all_aucs = make_null_model(X_pqn, y, clf_rf, cross_val_rf, num_shuffles=5)


(61,)
(61, 6984)
0.0% done! 2.35597205162s elapsed
20.0% done! 23.6080479622s elapsed
40.0% done! 44.22717309s elapsed
60.0% done! 68.1212849617s elapsed
80.0% done! 93.8263671398s elapsed
Number of differences b/t original and shuffle: 29
0.0% done! 2.33861112595s elapsed
20.0% done! 25.591173172s elapsed
40.0% done! 47.9045989513s elapsed
60.0% done! 68.4025800228s elapsed
80.0% done! 89.2007801533s elapsed
Number of differences b/t original and shuffle: 39
0.0% done! 2.10709905624s elapsed
20.0% done! 24.8863301277s elapsed
40.0% done! 46.2581169605s elapsed
60.0% done! 67.9512171745s elapsed
80.0% done! 88.8506979942s elapsed
Number of differences b/t original and shuffle: 29
0.0% done! 2.07453298569s elapsed
20.0% done! 22.6307518482s elapsed
40.0% done! 43.997317791s elapsed
60.0% done! 66.1913340092s elapsed
80.0% done! 86.6390628815s elapsed
Number of differences b/t original and shuffle: 25
0.0% done! 2.05713510513s elapsed
20.0% done! 22.4516680241s elapsed
40.0% done! 46.1893529892s elapsed
60.0% done! 69.9613699913s elapsed
80.0% done! 92.2264370918s elapsed
Number of differences b/t original and shuffle: 33
0.0% done! 2.04788899422s elapsed
20.0% done! 22.716892004s elapsed
40.0% done! 43.4195468426s elapsed
60.0% done! 63.9549958706s elapsed
80.0% done! 84.5669679642s elapsed

In [76]:
# make dataframe from true and false aucs
flattened_aucs = [j for i in all_aucs for j in i]
my_dict = {'true_auc': true_auc, 'null_auc': flattened_aucs}
df_poop =  pd.DataFrame.from_dict(my_dict, orient='index').T
df_tidy = pd.melt(df_poop, value_vars=['true_auc', 'null_auc'],
                 value_name='auc', var_name='AUC_type')
print df_tidy.head()
#print flattened_aucs
sns.violinplot(x='AUC_type', y='auc',
    inner='points', data=df_tidy, bw=0.7)
plt.show()


   AUC_type       auc
0  true_auc  0.886364
1  true_auc  0.943182
2  true_auc  0.681818
3  true_auc  0.795455
4  true_auc  0.886364

Let's check out some PCA plots


In [9]:
from sklearn.decomposition import PCA

# Check PCA of things
def PCA_plot(X, y, n_components, plot_color, class_nums, class_names, title='PCA'):
    pca = PCA(n_components=n_components)
    X_pca = pca.fit(X).transform(X)

    print zip(plot_color, class_nums, class_names)
    for color, i, target_name in zip(plot_color, class_nums, class_names):
        
        # plot one class at a time, first plot all classes y == 0
        #print color
        #print y == i
        xvals = X_pca[y == i, 0]
        print xvals.shape
        yvals = X_pca[y == i, 1]
        plt.scatter(xvals, yvals, color=color, alpha=0.8, label=target_name)

    plt.legend(bbox_to_anchor=(1.01,1), loc='upper left', shadow=False)#, scatterpoints=1)
    plt.title('PCA of Malaria data')
    plt.show()


PCA_plot(X_pqn, y, 2, ['red', 'blue'], [0,1], ['malaria', 'non-malaria fever'])
PCA_plot(X, y, 2, ['red', 'blue'], [0,1], ['malaria', 'non-malaria fever'])


[('red', 0, 'malaria'), ('blue', 1, 'non-malaria fever')]
(34,)
(27,)
[('red', 0, 'malaria'), ('blue', 1, 'non-malaria fever')]
(34,)
(27,)

What about with all thre classes?


In [10]:
# convert classes to numbers
le = preprocessing.LabelEncoder()
le.fit(class_map_df['class'])
y_three_class = le.transform(class_map_df['class'])
print class_map_df.head(10)
print y_three_class
print X.shape
print y_three_class.shape

y_labels = np.sort(class_map_df['class'].unique())
print y_labels
colors = ['green', 'red', 'blue']

print np.unique(y_three_class)
PCA_plot(X_pqn, y_three_class, 2, colors, np.unique(y_three_class), y_labels)
PCA_plot(X, y_three_class, 2, colors, np.unique(y_three_class), y_labels)


            MS Assay Name                            class
Sample Name                                               
MCMA429            1001_P                          malaria
MCMA430            1002_P                          malaria
MCMA431            1003_P                          malaria
MCMA433            1004_P                          malaria
MCMA434            1005_P     non-malarial febrile illness
MCMA435            1006_P                          malaria
MCMA436            1007_P  bacterial bloodstream infection
MCMA442            1009_P                          malaria
MCMA443            1010_P     non-malarial febrile illness
MCMA444            1011_P  bacterial bloodstream infection
[1 1 1 1 2 1 0 1 2 0 0 1 1 1 1 1 0 2 2 2 2 0 1 1 1 1 2 0 2 1 2 1 1 1 1 1 1
 2 1 0 1 1 0 0 1 2 0 2 2 1 0 1 1 2 0 2 1 1 1 1 1]
(61, 6984)
(61,)
['bacterial bloodstream infection' 'malaria' 'non-malarial febrile illness']
[0 1 2]
[('green', 0, 'bacterial bloodstream infection'), ('red', 1, 'malaria'), ('blue', 2, 'non-malarial febrile illness')]
(12,)
(34,)
(15,)
[('green', 0, 'bacterial bloodstream infection'), ('red', 1, 'malaria'), ('blue', 2, 'non-malarial febrile illness')]
(12,)
(34,)
(15,)