In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_selection import RFECV, RFE

random_state = 7

In [2]:
# pd.DataFrame.from_csv was removed in pandas 1.0; read_csv is the equivalent.
df = pd.read_csv("Lyme86genes215samples.txt", sep=r"\s+", index_col=0).T
df.replace(to_replace={'Disease': {"Lyme": 1, "Others": 0}}, inplace=True)

# Only use PBMC samples
df = df[df.Sampletype == "PBMC"]

# Remove columns that are not features
X = df.drop(["Disease", "Sampletype", "Serology"], axis=1)
y = df['Disease']

print "Complete data set has %d samples (%d w/Lyme) and %d features." % \
                (X.shape[0], y[y == 1].count(), X.shape[1])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=random_state, 
                                                    stratify=y)
print "Training set has %d samples. Testing set has %d samples." % (len(X_train), len(X_test))


Complete data set has 190 samples (60 w/Lyme) and 86 features.
Training set has 152 samples. Testing set has 38 samples.
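
Since the classes are imbalanced (60 Lyme vs. 130 controls), it is worth a quick check that stratify=y preserved the class ratio in both splits. This sketch uses only the variables defined above:

In [ ]:
for name, labels in [("train", y_train), ("test", y_test)]:
    # Fraction of Lyme (label 1) samples per split; both should be ~0.32.
    print("%s: %.2f Lyme fraction (%d samples)" % (name, labels.mean(), len(labels)))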

In [3]:
def print_gridsearch_results(clf):
    print("Best: %f using %s" % (clf.best_score_, clf.best_params_))
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    params = clf.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

# Sweep C over a wide log2 grid, from 2**-25 up to 2**3.
parameters = {'estimator__C': [2 ** i for i in range(-25, 4)]}
#est = LogisticRegression(class_weight="balanced")
est = LinearSVC(dual=True, class_weight="balanced")  # "balanced" offsets the 60-vs-130 class imbalance
scoring = "accuracy"
# RFECV picks the feature count by inner CV; the outer GridSearchCV tunes the SVM's C.
rfe = RFECV(cv=5, estimator=est, n_jobs=1, scoring=scoring, step=1, verbose=0)
clf = GridSearchCV(rfe, parameters, scoring=scoring, n_jobs=8, cv=3, verbose=1)
clf.fit(X_train, y_train)
print_gridsearch_results(clf)


Fitting 3 folds for each of 29 candidates, totalling 87 fits
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   16.5s
[Parallel(n_jobs=8)]: Done  87 out of  87 | elapsed:   44.0s finished
Best: 0.828947 using {'estimator__C': 7.62939453125e-06}
0.743421 (0.041084) with: {'estimator__C': 2.9802322387695312e-08}
0.730263 (0.026588) with: {'estimator__C': 5.960464477539063e-08}
0.736842 (0.019926) with: {'estimator__C': 1.1920928955078125e-07}
0.736842 (0.051145) with: {'estimator__C': 2.384185791015625e-07}
0.763158 (0.044103) with: {'estimator__C': 4.76837158203125e-07}
0.796053 (0.049480) with: {'estimator__C': 9.5367431640625e-07}
0.815789 (0.034762) with: {'estimator__C': 1.9073486328125e-06}
0.776316 (0.075383) with: {'estimator__C': 3.814697265625e-06}
0.828947 (0.010185) with: {'estimator__C': 7.62939453125e-06}
0.815789 (0.019499) with: {'estimator__C': 1.52587890625e-05}
0.822368 (0.014727) with: {'estimator__C': 3.0517578125e-05}
0.815789 (0.011055) with: {'estimator__C': 6.103515625e-05}
0.796053 (0.034895) with: {'estimator__C': 0.0001220703125}
0.802632 (0.069093) with: {'estimator__C': 0.000244140625}
0.743421 (0.056975) with: {'estimator__C': 0.00048828125}
0.756579 (0.031444) with: {'estimator__C': 0.0009765625}
0.736842 (0.076745) with: {'estimator__C': 0.001953125}
0.809211 (0.032330) with: {'estimator__C': 0.00390625}
0.769737 (0.032918) with: {'estimator__C': 0.0078125}
0.782895 (0.056666) with: {'estimator__C': 0.015625}
0.736842 (0.078342) with: {'estimator__C': 0.03125}
0.776316 (0.093487) with: {'estimator__C': 0.0625}
0.703947 (0.048266) with: {'estimator__C': 0.125}
0.796053 (0.064088) with: {'estimator__C': 0.25}
0.776316 (0.039429) with: {'estimator__C': 0.5}
0.809211 (0.048775) with: {'estimator__C': 1}
0.782895 (0.048230) with: {'estimator__C': 2}
0.809211 (0.026005) with: {'estimator__C': 4}
0.802632 (0.014573) with: {'estimator__C': 8}
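
The 'estimator__C' key uses scikit-learn's double-underscore convention to route each grid value through RFECV to the wrapped LinearSVC. To inspect the grid beyond the single best point, cv_results_ can be loaded into a DataFrame (a convenience sketch; the column names are standard GridSearchCV output):

In [ ]:
results = pd.DataFrame(clf.cv_results_)
# Top five C values by mean cross-validated accuracy.
print(results[["param_estimator__C", "mean_test_score", "std_test_score"]]
      .sort_values("mean_test_score", ascending=False)
      .head())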

In [4]:
best_estimator = clf.best_estimator_

print("Optimal number of features : %d" % best_estimator.n_features_)
print("Recursive Feature Elimination (RFE) eliminated %d features" % (X_train.shape[1] - best_estimator.n_features_))

# Plot number of features vs. cross-validation score.
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross-validation score")
plt.plot(range(1, len(best_estimator.grid_scores_) + 1), best_estimator.grid_scores_)
plt.show()


Optimal number of features : 40
Recursive Feature Elimination (RFE) eliminated 46 features
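
Note that RFECV.grid_scores_ was deprecated in scikit-learn 1.0 and removed in 1.2; on newer versions the equivalent plot reads from cv_results_ instead (a sketch, relying on step=1 above so that index i corresponds to i + 1 features):

In [ ]:
scores = best_estimator.cv_results_["mean_test_score"]
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross-validation score")
plt.plot(range(1, len(scores) + 1), scores)
plt.show()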

In [5]:
from IPython.display import display, HTML

rfe_features = []
for (rank, name) in zip(best_estimator.ranking_, X.columns):
    if rank == 1:
        rfe_features.append(name)

s="""
<h2>List of %d genes found by RFE</h2>
<p>Note: the NCBI link will open the target in a new window or tab.</p>
<table>
""" % (best_estimator.n_features_)

def ncbi_url(gene):
    return "https://www.ncbi.nlm.nih.gov/gene/?term=%s[Gene Name] AND Human[Organism]" % gene

s += "<tr>"
for i, gene in enumerate(rfe_features):
    if i and i % 10 == 0:  # start a new row every 10 genes
        s += "</tr><tr>"
    s += '<td><a target="_blank" href="%s">%s</a></td>' % (ncbi_url(gene), gene)
s += "</tr></table>"

display(HTML(s))


List of 40 genes found by RFE

Note: the NCBI link will open the target in a new window or tab.

ANPEP ANXA5 ASPM C3orf14 CASC5 CASP1 CCDC130 CCNB1 CD55 CDCA2
CKAP4 CR1 CXCL10 DRAM1 EIF2D FPR2 GBP2 GBP4 HMBS ICAM1
IFRD1 ITGB7 KCNJ2 LDLR MLF1IP NCF1 NUSAP1 OAS2 RAB12 RRM2
SOCS3 SORT1 SPAG5 STEAP4 SYTL1 TLR2 TNFSF13B TYMP ZNF276 ZNF384
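
RFECV's ranking_ is 1 for every retained feature and grows with how early a feature was eliminated, so the genes that just missed the cut can be listed the same way (an illustrative follow-up, not part of the original analysis):

In [ ]:
# Genes dropped in the final two elimination rounds (ranks 2 and 3).
near_misses = [name for rank, name in zip(best_estimator.ranking_, X.columns)
               if rank in (2, 3)]
print(near_misses)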

In [6]:
%matplotlib inline
from learning_curves import plot_learning_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit

def create_learning_curve(title, model):
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=random_state)                                     
    plot_learning_curve(model, title, X, y, (0.2, 1.01), cv=cv, n_jobs=1)
    
create_learning_curve("Learning Curves", est)

plt.show()


[Learning-curve figure: training and cross-validation scores vs. number of training examples]
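
plot_learning_curve comes from a local helper module (learning_curves.py, not shown here). A minimal sketch of the computation it presumably wraps, using scikit-learn's own learning_curve:

In [ ]:
import numpy as np
from sklearn.model_selection import learning_curve

# Mean train/validation accuracy at increasing training-set sizes.
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=random_state)
sizes, train_scores, val_scores = learning_curve(est, X, y, cv=cv, n_jobs=1)
plt.plot(sizes, np.mean(train_scores, axis=1), "o-", label="Training score")
plt.plot(sizes, np.mean(val_scores, axis=1), "o-", label="Cross-validation score")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.legend(loc="best")
plt.show()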

In [7]:
from classification_metrics import classification_metrics

# Evaluate the best RFECV pipeline from the grid search on the held-out test set.
y_pred = clf.predict(X_test)
classification_metrics(y_test, y_pred, ["Control", "Lyme"])


Accuracy was 84.21%

             precision    recall  f1-score   support

    Control       0.92      0.85      0.88        26
       Lyme       0.71      0.83      0.77        12

avg / total       0.85      0.84      0.85        38

Confusion Matrix: cols = predictions, rows = actual

                       Control           Lyme
        Control             22              4
           Lyme              2             10
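
classification_metrics is likewise a local helper (classification_metrics.py, not shown). The same report can be reproduced with scikit-learn's built-in metrics (a sketch of the likely equivalents):

In [ ]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy was %.2f%%" % (100 * accuracy_score(y_test, y_pred)))
print(classification_report(y_test, y_pred, target_names=["Control", "Lyme"]))
# Rows are actual classes, columns are predictions.
print(confusion_matrix(y_test, y_pred))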