In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_selection import RFECV, RFE

random_state = 7

In [2]:
# pd.DataFrame.from_csv was removed in pandas 1.0; read_csv is the equivalent.
df = pd.read_csv("Lyme86genes215samples.txt", sep=r"\s+", index_col=0).T
df.replace(to_replace={'Disease': {"Lyme": 1, "Others": 0}}, inplace=True)

# Only use PBMC samples
df = df[df.Sampletype == "PBMC"]

# Remove columns that are not features
X = df.drop(["Disease", "Sampletype", "Serology"], axis=1)
y = df['Disease']

print "Complete data set has %d samples (%d w/Lyme) and %d features." % \
                (X.shape[0], y[y == 1].count(), X.shape[1])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=random_state, 
                                                    stratify=y)
print "Training set has %d samples. Testing set has %d samples." % (len(X_train), len(X_test))


Complete data set has 190 samples (60 w/Lyme) and 86 features.
Training set has 152 samples. Testing set has 38 samples.
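
Since the classes are imbalanced (60 Lyme vs. 130 controls), it is worth a quick check that stratify=y preserved the class ratio in both splits. This sketch uses only the variables defined above:

In [ ]:
for name, labels in [("train", y_train), ("test", y_test)]:
    # Fraction of Lyme (label 1) samples per split; both should be ~0.32.
    print("%s: %.2f Lyme fraction (%d samples)" % (name, labels.mean(), len(labels)))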

In [3]:
def print_gridsearch_results(clf):
    print("Best: %f using %s" % (clf.best_score_, clf.best_params_))
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    params = clf.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

# Sweep C over a wide log2 grid, from 2**-25 up to 2**3.
parameters = {'estimator__C': [2 ** i for i in range(-25, 4)]}
#est = LogisticRegression(class_weight="balanced")
est = LinearSVC(dual=True, class_weight="balanced")  # "balanced" offsets the 60-vs-130 class imbalance
scoring = "accuracy"
# RFECV picks the feature count by inner CV; the outer GridSearchCV tunes the SVM's C.
rfe = RFECV(cv=5, estimator=est, n_jobs=1, scoring=scoring, step=1, verbose=0)
clf = GridSearchCV(rfe, parameters, scoring=scoring, n_jobs=8, cv=3, verbose=1)
clf.fit(X_train, y_train)
print_gridsearch_results(clf)


Fitting 3 folds for each of 29 candidates, totalling 87 fits
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   16.5s
[Parallel(n_jobs=8)]: Done  87 out of  87 | elapsed:   44.0s finished
Best: 0.828947 using {'estimator__C': 7.62939453125e-06}
0.743421 (0.041084) with: {'estimator__C': 2.9802322387695312e-08}
0.730263 (0.026588) with: {'estimator__C': 5.960464477539063e-08}
0.736842 (0.019926) with: {'estimator__C': 1.1920928955078125e-07}
0.736842 (0.051145) with: {'estimator__C': 2.384185791015625e-07}
0.763158 (0.044103) with: {'estimator__C': 4.76837158203125e-07}
0.796053 (0.049480) with: {'estimator__C': 9.5367431640625e-07}
0.815789 (0.034762) with: {'estimator__C': 1.9073486328125e-06}
0.776316 (0.075383) with: {'estimator__C': 3.814697265625e-06}
0.828947 (0.010185) with: {'estimator__C': 7.62939453125e-06}
0.815789 (0.019499) with: {'estimator__C': 1.52587890625e-05}
0.822368 (0.014727) with: {'estimator__C': 3.0517578125e-05}
0.815789 (0.011055) with: {'estimator__C': 6.103515625e-05}
0.796053 (0.034895) with: {'estimator__C': 0.0001220703125}
0.802632 (0.069093) with: {'estimator__C': 0.000244140625}
0.743421 (0.056975) with: {'estimator__C': 0.00048828125}
0.756579 (0.031444) with: {'estimator__C': 0.0009765625}
0.736842 (0.076745) with: {'estimator__C': 0.001953125}
0.809211 (0.032330) with: {'estimator__C': 0.00390625}
0.769737 (0.032918) with: {'estimator__C': 0.0078125}
0.782895 (0.056666) with: {'estimator__C': 0.015625}
0.736842 (0.078342) with: {'estimator__C': 0.03125}
0.776316 (0.093487) with: {'estimator__C': 0.0625}
0.703947 (0.048266) with: {'estimator__C': 0.125}
0.796053 (0.064088) with: {'estimator__C': 0.25}
0.776316 (0.039429) with: {'estimator__C': 0.5}
0.809211 (0.048775) with: {'estimator__C': 1}
0.782895 (0.048230) with: {'estimator__C': 2}
0.809211 (0.026005) with: {'estimator__C': 4}
0.802632 (0.014573) with: {'estimator__C': 8}
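
The 'estimator__C' key uses scikit-learn's double-underscore convention to route each grid value through RFECV to the wrapped LinearSVC. To inspect the grid beyond the single best point, cv_results_ can be loaded into a DataFrame (a convenience sketch; the column names are standard GridSearchCV output):

In [ ]:
results = pd.DataFrame(clf.cv_results_)
# Top five C values by mean cross-validated accuracy.
print(results[["param_estimator__C", "mean_test_score", "std_test_score"]]
      .sort_values("mean_test_score", ascending=False)
      .head())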

In [4]:
best_estimator = clf.best_estimator_

print("Optimal number of features : %d" % best_estimator.n_features_)
print("Recursive Feature Elimination (RFE) eliminated %d features" % (X_train.shape[1] - best_estimator.n_features_))

# Plot number of features vs. cross-validation score.
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross-validation score")
plt.plot(range(1, len(best_estimator.grid_scores_) + 1), best_estimator.grid_scores_)
plt.show()


Optimal number of features : 40
Recursive Feature Elimination (RFE) eliminated 46 features
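
Note that RFECV.grid_scores_ was deprecated in scikit-learn 1.0 and removed in 1.2; on newer versions the equivalent plot reads from cv_results_ instead (a sketch, relying on step=1 above so that index i corresponds to i + 1 features):

In [ ]:
scores = best_estimator.cv_results_["mean_test_score"]
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross-validation score")
plt.plot(range(1, len(scores) + 1), scores)
plt.show()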

In [5]:
from IPython.display import display, HTML

rfe_features = []
for (rank, name) in zip(best_estimator.ranking_, X.columns):
    if rank == 1:
        rfe_features.append(name)

s="""
<h2>List of %d genes found by RFE</h2>
<p>Note: the NCBI link will open the target in a new window or tab.</p>
<table>
""" % (best_estimator.n_features_)

def ncbi_url(gene):
    return "https://www.ncbi.nlm.nih.gov/gene/?term=%s[Gene Name] AND Human[Organism]" % gene

s += "<tr>"
for i, gene in enumerate(rfe_features):
    if i and i % 10 == 0:  # start a new row every 10 genes
        s += "</tr><tr>"
    s += '<td><a target="_blank" href="%s">%s</a></td>' % (ncbi_url(gene), gene)
s += "</tr></table>"

display(HTML(s))


List of 40 genes found by RFE

Note: the NCBI link will open the target in a new window or tab.

ANPEP ANXA5 ASPM C3orf14 CASC5 CASP1 CCDC130 CCNB1 CD55 CDCA2
CKAP4 CR1 CXCL10 DRAM1 EIF2D FPR2 GBP2 GBP4 HMBS ICAM1
IFRD1 ITGB7 KCNJ2 LDLR MLF1IP NCF1 NUSAP1 OAS2 RAB12 RRM2
SOCS3 SORT1 SPAG5 STEAP4 SYTL1 TLR2 TNFSF13B TYMP ZNF276 ZNF384
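
RFECV's ranking_ is 1 for every retained feature and grows with how early a feature was eliminated, so the genes that just missed the cut can be listed the same way (an illustrative follow-up, not part of the original analysis):

In [ ]:
# Genes dropped in the final two elimination rounds (ranks 2 and 3).
near_misses = [name for rank, name in zip(best_estimator.ranking_, X.columns)
               if rank in (2, 3)]
print(near_misses)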

In [6]:
%matplotlib inline
from learning_curves import plot_learning_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit

def create_learning_curve(title, model):
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=random_state)                                     
    plot_learning_curve(model, title, X, y, (0.2, 1.01), cv=cv, n_jobs=1)
    
create_learning_curve("Learning Curves", est)

plt.show()


[Learning-curve figure: training and cross-validation scores vs. number of training examples]
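
plot_learning_curve comes from a local helper module (learning_curves.py, not shown here). A minimal sketch of the computation it presumably wraps, using scikit-learn's own learning_curve:

In [ ]:
import numpy as np
from sklearn.model_selection import learning_curve

# Mean train/validation accuracy at increasing training-set sizes.
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=random_state)
sizes, train_scores, val_scores = learning_curve(est, X, y, cv=cv, n_jobs=1)
plt.plot(sizes, np.mean(train_scores, axis=1), "o-", label="Training score")
plt.plot(sizes, np.mean(val_scores, axis=1), "o-", label="Cross-validation score")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.legend(loc="best")
plt.show()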

In [7]:
from classification_metrics import classification_metrics

# Evaluate the best RFECV pipeline from the grid search on the held-out test set.
y_pred = clf.predict(X_test)
classification_metrics(y_test, y_pred, ["Control", "Lyme"])


Accuracy was 84.21%

             precision    recall  f1-score   support

    Control       0.92      0.85      0.88        26
       Lyme       0.71      0.83      0.77        12

avg / total       0.85      0.84      0.85        38

Confusion Matrix: cols = predictions, rows = actual

                       Control           Lyme
        Control             22              4
           Lyme              2             10
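
classification_metrics is likewise a local helper (classification_metrics.py, not shown). The same report can be reproduced with scikit-learn's built-in metrics (a sketch of the likely equivalents):

In [ ]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy was %.2f%%" % (100 * accuracy_score(y_test, y_pred)))
print(classification_report(y_test, y_pred, target_names=["Control", "Lyme"]))
# Rows are actual classes, columns are predictions.
print(confusion_matrix(y_test, y_pred))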