Predicting Pathogen Type from RNA-seq Data


In [1]:
from math import pow
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, GridSearchCV
from sklearn.svm import LinearSVC

random_state = 0xDEADCAFE

In [2]:
patient_groups = ["control", "viral", "bacterial", "fungal"]
group_id = lambda name: patient_groups.index(name)

# The CPM table stores genes as rows and samples as columns, so transpose to
# get one row per patient and one column per gene.
# (Newer pandas versions drop DataFrame.from_csv; pd.read_csv with sep="\s+"
# and index_col=0 is the equivalent call.)
X = pd.DataFrame.from_csv("combineSV_WTcpmtable_v2.txt", sep="\s+").T
# Sample order in the table: 29 bacterial, 42 viral, 10 fungal, 61 control.
y = [group_id("bacterial")] * 29 \
    + [group_id("viral")] * 42 \
    + [group_id("fungal")] * 10 \
    + [group_id("control")] * 61

print "Complete data set has %d samples and %d features." % (X.shape[0], X.shape[1])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)
print "Training set has %d samples. Testing set has %d samples." % (len(X_train), len(X_test))


Complete data set has 142 samples and 25342 features.
Training set has 99 samples. Testing set has 43 samples.
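
With only 10 fungal samples in 142, a plain random split can leave very few fungal cases in either partition. A stratified split is one alternative worth considering; a minimal sketch, reusing the X, y, and random_state defined above:

# Sketch only: stratified variant of the split above, preserving each class's
# proportion in both the training and test partitions.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=random_state)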

In [4]:
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFECV

# Grid over C on a log2 scale; the 'estimator__' prefix forwards the value
# through RFECV to the wrapped LinearSVC.
parameters = {'estimator__C': [pow(2, i) for i in xrange(-25, 4, 1)]}
svc = LinearSVC(class_weight='balanced')
# RFECV removes 2% of the features at each elimination step and keeps the
# feature count with the best 2-fold cross-validated accuracy.
rfe = RFECV(estimator=svc, step=.02, cv=2, scoring='accuracy', n_jobs=1)
# Outer 3-fold grid search over C, parallelised across 8 workers.
clf = GridSearchCV(rfe, parameters, scoring='accuracy', n_jobs=8, cv=3, verbose=1)
clf.fit(X_train, y_train)

# summarize results
print("Best: %f using %s" % (clf.best_score_, clf.best_params_))
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
params = clf.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


Fitting 3 folds for each of 29 candidates, totalling 87 fits
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:  8.3min
[Parallel(n_jobs=8)]: Done  87 out of  87 | elapsed: 19.4min finished
Best: 0.515152 using {'estimator__C': 0.0009765625}
0.474747 (0.108626) with: {'estimator__C': 2.9802322387695312e-08}
0.444444 (0.135774) with: {'estimator__C': 5.960464477539063e-08}
0.484848 (0.101307) with: {'estimator__C': 1.1920928955078125e-07}
0.474747 (0.082402) with: {'estimator__C': 2.384185791015625e-07}
0.505051 (0.063877) with: {'estimator__C': 4.76837158203125e-07}
0.474747 (0.111932) with: {'estimator__C': 9.5367431640625e-07}
0.494949 (0.099272) with: {'estimator__C': 1.9073486328125e-06}
0.484848 (0.101307) with: {'estimator__C': 3.814697265625e-06}
0.454545 (0.135020) with: {'estimator__C': 7.62939453125e-06}
0.464646 (0.123226) with: {'estimator__C': 1.52587890625e-05}
0.494949 (0.091583) with: {'estimator__C': 3.0517578125e-05}
0.484848 (0.101307) with: {'estimator__C': 6.103515625e-05}
0.464646 (0.123226) with: {'estimator__C': 0.0001220703125}
0.505051 (0.083079) with: {'estimator__C': 0.000244140625}
0.454545 (0.135020) with: {'estimator__C': 0.00048828125}
0.515152 (0.076202) with: {'estimator__C': 0.0009765625}
0.474747 (0.111932) with: {'estimator__C': 0.001953125}
0.484848 (0.101307) with: {'estimator__C': 0.00390625}
0.505051 (0.083079) with: {'estimator__C': 0.0078125}
0.494949 (0.091583) with: {'estimator__C': 0.015625}
0.505051 (0.083079) with: {'estimator__C': 0.03125}
0.484848 (0.101307) with: {'estimator__C': 0.0625}
0.505051 (0.083079) with: {'estimator__C': 0.125}
0.494949 (0.099272) with: {'estimator__C': 0.25}
0.505051 (0.083079) with: {'estimator__C': 0.5}
0.464646 (0.123226) with: {'estimator__C': 1.0}
0.505051 (0.083079) with: {'estimator__C': 2.0}
0.494949 (0.091583) with: {'estimator__C': 4.0}
0.505051 (0.088178) with: {'estimator__C': 8.0}
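
The inner cross-validation scores barely change across the whole C grid, staying around 0.45-0.52, which is close to the majority-class baseline (controls make up 61/142, about 43% of the samples). A quick sketch to compute that baseline with scikit-learn's DummyClassifier:

from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score

# Sketch only: accuracy of always predicting the most frequent class, as a
# floor against which to judge the grid-search scores above.
baseline = DummyClassifier(strategy='most_frequent')
print(cross_val_score(baseline, X_train, y_train, cv=3, scoring='accuracy').mean())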

In [5]:
best_estimator = clf.best_estimator_

print("Optimal number of features : %d" % best_estimator.n_features_)
print("Recursive Feature Elimination (RFE) eliminated %d features" % (X.shape[1] - best_estimator.n_features_))

# Plot the cross-validation score at each feature-elimination step.
# Note: grid_scores_ holds one score per RFECV iteration (each step drops ~2%
# of the features), not one score per individual feature count.
plt.figure()
plt.xlabel("Number of feature subsets selected")
plt.ylabel("Cross validation score")
plt.plot(range(1, len(best_estimator.grid_scores_) + 1), best_estimator.grid_scores_)
plt.show()


Optimal number of features : 2066
Recursive Feature Elimination (RFE) eliminated 23276 features
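
The genes that survive elimination can be read off the fitted selector; a short sketch, assuming the fitted best_estimator and the gene names held in X.columns:

# Sketch only: RFECV exposes a boolean mask (support_) aligned with the
# input columns, so indexing X.columns with it yields the retained genes.
selected_genes = X.columns[best_estimator.support_]
print("Retained %d genes; first 10: %s" % (len(selected_genes), list(selected_genes[:10])))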

In [6]:
%matplotlib inline
from learning_curves import plot_learning_curve
import matplotlib.pyplot as plt

def create_learning_curve(title, model):
    cv = ShuffleSplit(n_splits=2, test_size=0.3, random_state=random_state)
    plot_learning_curve(model, title, X, y, (0.2, 1.01), cv=cv, n_jobs=1)

create_learning_curve("Learning Curves Full Feature Set", svc)
# Caveat: if plot_learning_curve follows the usual scikit-learn recipe, it
# refits a clone of the model on the full X, so the inner LinearSVC below is
# still trained on all features; read "Limited Feature Set" with that in mind.
create_learning_curve("Learning Curves Limited Feature Set", best_estimator.estimator_)

plt.show()


[Learning-curve plots for the full and the reduced feature set]
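
plot_learning_curve comes from a local learning_curves module that is not shown here. Assuming it mirrors the standard scikit-learn learning-curve example, a minimal sketch of such a helper looks like this:

import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

def sketch_learning_curve(model, title, X, y, cv, n_jobs=1):
    # Train on increasing subsets of the data and record train/CV accuracy.
    train_sizes, train_scores, test_scores = learning_curve(
        model, X, y, cv=cv, n_jobs=n_jobs)
    plt.figure()
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.plot(train_sizes, train_scores.mean(axis=1), 'o-', label="Training score")
    plt.plot(train_sizes, test_scores.mean(axis=1), 'o-', label="Cross-validation score")
    plt.legend(loc="best")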

Make predictions on the held-out test set


In [7]:
from classification_metrics import classification_metrics

# make predictions
predicted = best_estimator.predict(X_test)

classification_metrics(y_test, predicted, patient_groups)


Accuracy was 62.79%

             precision    recall  f1-score   support

    control       0.62      0.83      0.71        18
      viral       0.88      0.44      0.58        16
  bacterial       0.45      0.62      0.53         8
     fungal       0.00      0.00      0.00         1

avg / total       0.67      0.63      0.61        43

Confusion Matrix: cols = predictions, rows = actual

                       control          viral      bacterial         fungal
        control             15              1              2              0
          viral              6              7              3              0
      bacterial              3              0              5              0
         fungal              0              0              1              0
/Users/matt/anaconda2/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
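
classification_metrics is a small local helper; the figures above can be reproduced directly with the scikit-learn functions imported in the first cell, roughly as follows (the label order is assumed to follow patient_groups):

# Sketch only: the underlying scikit-learn calls behind the summary above.
labels = range(len(patient_groups))  # 0..3, matching patient_groups order
print("Accuracy was %.2f%%" % (100 * accuracy_score(y_test, predicted)))
print(classification_report(y_test, predicted, labels=labels, target_names=patient_groups))
print(confusion_matrix(y_test, predicted, labels=labels))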

Review model predictions


In [8]:
# LinearSVC returns hard class labels (not probabilities), so predict once
# and map the integer labels back to group names.
predicted_labels = best_estimator.predict(X_test)

d = {"Predicted": [patient_groups[i] for i in predicted_labels],
     "Actual": [patient_groups[i] for i in y_test]}

patient_df = pd.DataFrame(d, index=X_test.index)
patient_df.sort_values(by="Actual")


Out[8]:
Actual Predicted
MNC.473 bacterial bacterial
MN_223 bacterial bacterial
MN_208 bacterial bacterial
MNC.557 bacterial control
MN_363 bacterial bacterial
MN_304 bacterial bacterial
MNC.214 bacterial control
MN_307 bacterial control
MNC.611 control control
MN_366 control control
MNC.633 control control
MNC.151 control control
MNC.595 control control
MNC.571 control control
MNC.452.y control bacterial
MNC.294 control control
MNC.172 control control
MNC.612 control control
MNC.392 control viral
MNC.631 control bacterial
MNC.173 control control
MNC.676 control control
MNC.213 control control
MNC.533 control control
MNC.354 control control
MNC.675 control control
MN_323 fungal bacterial
MN_382 viral bacterial
MNC.033 viral control
MN_207 viral control
MNC.091 viral viral
MN_097.y viral viral
MNC.215 viral viral
MNC.474 viral viral
MNC.012.y viral control
MN_100 viral control
MNC.534 viral control
MNC.674 viral control
MN_282 viral viral
MN_171 viral viral
MN_284 viral bacterial
MNC.155 viral bacterial
MNC.014 viral viral
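
Because LinearSVC has no predict_proba, the Predicted column above is a hard label with no attached confidence. One-vs-rest decision scores can stand in for a rough confidence measure; a sketch, reusing the fitted best_estimator:

# Sketch only: one-vs-rest margins from the underlying LinearSVC; larger
# values mean the sample sits further inside that class's decision region.
scores = best_estimator.decision_function(X_test)  # shape (n_samples, 4)
score_df = pd.DataFrame(scores, index=X_test.index, columns=patient_groups)
print(score_df.head())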

Review patients the model misclassified


In [9]:
patient_df[patient_df["Predicted"] != patient_df["Actual"]]


Out[9]:
Actual Predicted
MNC.557 bacterial control
MN_284 viral bacterial
MN_323 fungal bacterial
MNC.674 viral control
MNC.534 viral control
MNC.631 control bacterial
MNC.392 control viral
MN_307 bacterial control
MN_100 viral control
MNC.012.y viral control
MNC.214 bacterial control
MNC.155 viral bacterial
MN_207 viral control
MNC.452.y control bacterial
MN_382 viral bacterial
MNC.033 viral control
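
With only 43 held-out samples, the 62.79% accuracy above carries a wide margin of error. One rough way to gauge how split-dependent it is would be cross-validated accuracy over the full data set with the selected C (a sketch only; it re-runs the feature elimination per fold, so it is slow, and reusing the tuned C this way is slightly optimistic):

from sklearn.model_selection import cross_val_score

# Sketch only: 5-fold cross-validated accuracy for the chosen C, as a rough
# check on the variability of the single train/test-split estimate.
best_C = clf.best_params_['estimator__C']
cv_model = RFECV(estimator=LinearSVC(class_weight='balanced', C=best_C),
                 step=.02, cv=2, scoring='accuracy')
cv_scores = cross_val_score(cv_model, X, y, cv=5, scoring='accuracy')
print("CV accuracy: %.3f +/- %.3f" % (cv_scores.mean(), cv_scores.std()))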