In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_selection import RFECV, RFE
random_state = 7
In [2]:
# The file stores one row per gene (plus metadata rows) and one column per
# sample; transpose so that each row is a sample and each column a feature.
df = pd.DataFrame.from_csv("Lyme86genes215samples.txt", sep="\s+").T
# Encode the class label numerically: Lyme = 1, Others = 0.
df.replace(to_replace={'Disease': {"Lyme": 1, "Others": 0}}, inplace=True)
# Only use PBMC samples
df = df[df.Sampletype == "PBMC"]
# Remove columns that are not features
X = df.drop(["Disease", "Sampletype", "Serology"], axis=1)
y = df['Disease']
print "Complete data set has %d samples (%d w/Lyme) and %d features." % \
    (X.shape[0], y[y == 1].count(), X.shape[1])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=random_state,
                                                    stratify=y)
print "Training set has %d samples. Testing set has %d samples." % (len(X_train), len(X_test))
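# Added sketch: quick sanity check that the stratified split keeps the
# Lyme/control proportions roughly equal in both subsets (y is 0/1, so the
# mean is the Lyme fraction).
print "Lyme fraction: %.2f (train) vs %.2f (test)" % (y_train.mean(), y_test.mean())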
In [3]:
def print_gridsearch_results(clf):
    print("Best: %f using %s" % (clf.best_score_, clf.best_params_))
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    params = clf.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
# Sweep the SVC regularization strength C over powers of two (2^-25 ... 2^3);
# the "estimator__" prefix routes the parameter to the LinearSVC inside RFECV.
parameters = {'estimator__C': [pow(2, i) for i in xrange(-25, 4, 1)]}
#est = LogisticRegression(class_weight="balanced")
est = LinearSVC(dual=True, class_weight="balanced")
scoring = "accuracy"
# Recursive feature elimination with 5-fold CV to choose the feature count,
# wrapped in a 3-fold grid search over C.
rfe = RFECV(cv=5, estimator=est, n_jobs=1, scoring=scoring, step=1, verbose=0)
clf = GridSearchCV(rfe, parameters, scoring=scoring, n_jobs=8, cv=3, verbose=1)
clf.fit(X_train, y_train)
print_gridsearch_results(clf)
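# Added sketch: cv_results_ can also be inspected as a DataFrame, which makes
# it easy to rank the candidate C values by mean cross-validated accuracy
# (the column names below are the standard GridSearchCV keys).
cv_table = pd.DataFrame(clf.cv_results_)
print(cv_table[["param_estimator__C", "mean_test_score", "std_test_score"]]
      .sort_values("mean_test_score", ascending=False).head())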
In [4]:
best_estimator = clf.best_estimator_
print("Optimal number of features: %d" % best_estimator.n_features_)
print("Recursive Feature Elimination (RFE) eliminated %d features" % (X_train.shape[1] - best_estimator.n_features_))
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross-validation score (accuracy)")
plt.plot(range(1, len(best_estimator.grid_scores_) + 1), best_estimator.grid_scores_)
plt.show()
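# Added sketch: best_estimator.estimator_ is the LinearSVC refit on the
# selected features only; pairing its coefficients with the selected column
# names gives a rough per-gene weight (sign and magnitude, not a p-value).
selected_genes = X.columns[best_estimator.support_]
weights = best_estimator.estimator_.coef_[0]
for gene, w in sorted(zip(selected_genes, weights), key=lambda t: -abs(t[1]))[:10]:
    print("%s\t%+.3f" % (gene, w))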
In [5]:
from IPython.core.display import display, HTML
rfe_features = []
# Features assigned rank 1 are the ones RFE kept (equivalent to best_estimator.support_).
for (rank, name) in zip(best_estimator.ranking_, X.columns):
    if rank == 1:
        rfe_features.append(name)
s="""
<h2>List of %d genes found by RFE</h2>
<p>Note: the NCBI link will open the target in a new window or tab.</p>
<table>
""" % (best_estimator.n_features_)
ncbi_url = lambda gene: "https://www.ncbi.nlm.nih.gov/gene/?term=%s[Gene Name] AND Human[Organism]" % (gene)
s += "<tr>"
for (i, gene) in enumerate(rfe_features):
    # Start a new table row every 10 genes (skip the very first gene so we
    # do not emit an empty leading row).
    if i and not i % 10:
        s += "</tr><tr>"
    s += """
    <td><a target="_blank" href="%s">%s</a></td>
    """ % (ncbi_url(gene), gene)
s += "</tr></table>"
display(HTML(s))
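# Added sketch: write the selected gene names to a plain-text file (one per
# line) for downstream tools; the filename here is arbitrary.
with open("rfe_selected_genes.txt", "w") as fh:
    fh.write("\n".join(rfe_features) + "\n")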
In [6]:
%matplotlib inline
from learning_curves import plot_learning_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
def create_learning_curve(title, model):
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=random_state)
    plot_learning_curve(model, title, X, y, (0.2, 1.01), cv=cv, n_jobs=1)
create_learning_curve("Learning Curves", est)
plt.show()
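# Added sketch: plot_learning_curve comes from a local helper module that is
# not shown in this notebook; an assumed rough equivalent built directly on
# sklearn.model_selection.learning_curve would look like this.
import numpy as np
from sklearn.model_selection import learning_curve
sizes, train_scores, valid_scores = learning_curve(
    est, X, y,
    cv=StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=random_state),
    train_sizes=np.linspace(0.1, 1.0, 5), n_jobs=1)
plt.plot(sizes, train_scores.mean(axis=1), "o-", label="Training score")
plt.plot(sizes, valid_scores.mean(axis=1), "o-", label="Cross-validation score")
plt.xlabel("Training examples")
plt.ylabel("Accuracy")
plt.legend(loc="best")
plt.show()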
In [7]:
from classification_metrics import classification_metrics
# clf.predict() delegates to the best RFECV found by the grid search,
# refit on the full training set.
est_predicted = clf.predict(X_test)
classification_metrics(y_test, est_predicted, ["Control", "Lyme"])
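# Added sketch: classification_metrics is a local helper that is not shown
# here; an assumed rough equivalent using sklearn.metrics on the held-out
# test set would be:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
print("Test accuracy: %.3f" % accuracy_score(y_test, est_predicted))
print(confusion_matrix(y_test, est_predicted))
print(classification_report(y_test, est_predicted, target_names=["Control", "Lyme"]))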