In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from math import pow
from sklearn.feature_selection import RFECV, RFE
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import LinearSVC, LinearSVR
random_state = 7
In [2]:
patient_groups=["control", "viral", "bacterial", "fungal"]
group_id = lambda name: patient_groups.index(name)
X = pd.DataFrame.from_csv("combineSV_WTcpmtable_v2.txt", sep="\s+").T
# + [group_id("fungal")] * 10 \
y = [group_id("bacterial")] * 29 \
+ [group_id("viral")] * 42 \
+ [group_id("control")] * 61
# Drop the fungal patients
fv = range(29+42, 29+42+10)
X = X.drop(X.index[fv])
print "Complete data set has %d samples and %d features." % (X.shape[0], X.shape[1])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=random_state,
stratify=y)
print "Training set has %d samples. Testing set has %d samples." % (len(X_train), len(X_test))
In [3]:
def print_gridsearch_results(clf):
print("Best: %f using %s" % (clf.best_score_, clf.best_params_))
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
params = clf.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
print("%f (%f) with: %r" % (mean, stdev, param))
In [4]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
#parameters={'estimator__C': [pow(2, i) for i in xrange(-25, 4, 1)]}
est = LogisticRegression(class_weight="balanced", C=pow(2, -23))
#est = LinearSVR(C=pow(2, -23))
#clf = RFE(estimator=lr, step=0.01, n_features_to_select=1, verbose=1)
clf = RFECV(cv=4, estimator=est, n_jobs=6, scoring='neg_mean_squared_error', step=.001, verbose=0)
#clf = GridSearchCV(rfe, parameters, scoring='accuracy', n_jobs=8, cv=3, verbose=1)
clf.fit(X_train, y_train)
#print_gridsearch_results(clf)
Out[4]:
In [5]:
from IPython.core.display import display, HTML
rfe_features = []
for (rank, name) in zip(clf.ranking_, X.columns):
if rank == 1:
rfe_features.append(name)
s="""
<h2>List of %d genes found by RFE</h2>
<p>Note: the NCBI link will open the target in a new window or tab.</p>
<table>
""" % (clf.n_features_)
ncbi_url = lambda gene: "https://www.ncbi.nlm.nih.gov/gene/?term=%s[Gene Name] AND Human[Organism]" % (gene)
s += "<tr>"
for (i, gene) in enumerate(rfe_features):
if not i % 10:
s += "</tr><tr>"
s += """
<td><a target=\"_blank\" href=\"%s\">%s</a></td>
""" %(ncbi_url(gene), gene)
s += "</tr></table>"
display(HTML(s))
In [6]:
best_estimator = clf
print("Optimal number of features : %d" % best_estimator.n_features_)
print("Recursive Feature Elimination (RFE) eliminated %d features" % (X.shape[1] - best_estimator.n_features_))
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of feature subsets selected")
plt.ylabel("Cross validation score")
plt.plot(range(1, len(best_estimator.grid_scores_) + 1), best_estimator.grid_scores_)
plt.show()
In [7]:
%matplotlib inline
from learning_curves import plot_learning_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
def create_learning_curve(title, model):
cv = StratifiedShuffleSplit(n_splits=2, test_size=0.3, random_state=random_state)
plot_learning_curve(model, title, X, y, (0.2, 1.01), cv=cv, n_jobs=1)
create_learning_curve("Learning Curves Full Feature Set", est)
create_learning_curve("Learning Curves RFE Feature Set", clf.estimator_)
plt.show()
In [11]:
from classification_metrics import classification_metrics
est.fit(X_train, y_train)
est_predicted = est.predict(X_test)
print "Full Metrics"
classification_metrics(y_test, est_predicted, patient_groups)
rfe_predicted = clf.predict(X_test)
print "-" * 80
print "\nRFE Metrics"
classification_metrics(y_test, rfe_predicted, patient_groups)
In [12]:
probs = pd.DataFrame(clf.predict_proba(X_test))
probstrs = lambda vals: ["%.2f" % (p*100) for p in vals]
d = {"Predicted": [patient_groups[i] for i in rfe_predicted],
"Actual": [patient_groups[i] for i in y_test],
"Prob. control": probstrs(probs[0]),
"Prob. viral": probstrs(probs[1]),
"Prob. bacteria": probstrs(probs[2])}
patient_df = pd.DataFrame(d, index=X_test.index)
patient_df.sort_values(by="Actual")
Out[12]:
In [13]:
patient_df[patient_df["Predicted"] != patient_df["Actual"]]
Out[13]: