In [1]:
from math import pow
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.pipeline import Pipeline
random_state = 0xDEADCAFE
use_gsea_genes = True
rfe_step = 0.02
In [2]:
if use_gsea_genes:
    # Restrict the analysis to the GSEA gene set; the first line of the file is a header.
    with open("geneSet.txt") as gsfile:
        gsea_genes = gsfile.read().splitlines()[1:]
    # With the smaller gene set, eliminate one feature per RFE iteration.
    rfe_step = 1
In [3]:
patient_groups = ["control", "viral", "bacterial", "fungal"]
group_id = lambda name: patient_groups.index(name)

# Expression table has genes as rows and samples as columns; transpose to samples x genes.
X = pd.read_csv("combineSV_WTcpmtable_v2.txt", sep=r"\s+", index_col=0).T

# Sample labels, in the same order as the samples in the expression table.
y = [group_id("bacterial")] * 29 \
    + [group_id("viral")] * 42 \
    + [group_id("fungal")] * 10 \
    + [group_id("control")] * 61

if use_gsea_genes:
    X = X.filter(gsea_genes, axis=1)

print("Complete data set has %d samples and %d features." % (X.shape[0], X.shape[1]))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)
print("Training set has %d samples. Testing set has %d samples." % (len(X_train), len(X_test)))
In [4]:
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
# Grid of regularization strengths; 'estimator__C' addresses the LinearSVC wrapped by RFECV.
parameters = {'estimator__C': [pow(2, i) for i in range(-25, 4)]}
svc = LinearSVC(class_weight='balanced')
rfe = RFECV(estimator=svc, step=rfe_step, cv=2, scoring='accuracy', n_jobs=1)
clf = GridSearchCV(rfe, parameters, scoring='accuracy', n_jobs=8, cv=2, verbose=1)
clf.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (clf.best_score_, clf.best_params_))
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
params = clf.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
In [5]:
best_estimator = clf.best_estimator_
print("Optimal number of features : %d" % best_estimator.n_features_)
print("Recursive Feature Elimination (RFE) eliminated %d features" % (X.shape[1] - best_estimator.n_features_))
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of feature subsets selected")
plt.ylabel("Cross validation score")
plt.plot(range(1, len(best_estimator.grid_scores_) + 1), best_estimator.grid_scores_)
plt.show()
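The fitted RFECV also records which features survived elimination in its support_ mask, so the retained genes can be listed by name (this lookup is an addition, not in the original notebook):

# Genes retained by recursive feature elimination, looked up from the column index of X.
selected_genes = X.columns[best_estimator.support_]
print("%d genes selected, e.g.: %s" % (len(selected_genes), list(selected_genes[:10])))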
In [6]:
%matplotlib inline
from learning_curves import plot_learning_curve
import matplotlib.pyplot as plt
def create_learning_curve(title, model):
    cv = ShuffleSplit(n_splits=2, test_size=0.3, random_state=random_state)
    plot_learning_curve(model, title, X, y, (0.2, 1.01), cv=cv, n_jobs=1)
create_learning_curve("Learning Curves Full Feature Set", svc)
create_learning_curve("Learning Curves Limited Feature Set", best_estimator.estimator_)
plt.show()
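plot_learning_curve is imported from a local learning_curves module that is not shown in this notebook. As a rough sketch of what such a helper typically does, modelled on scikit-learn's documented learning_curve function (the exact plotting details of the original module are an assumption):

# Sketch of a plot_learning_curve-style helper, assuming it follows the scikit-learn example:
# score the model on growing subsets of the training data and plot mean train/CV scores.
from sklearn.model_selection import learning_curve  # np and plt are already imported above

def plot_learning_curve_sketch(estimator, title, X, y, ylim=None, cv=None, n_jobs=1):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=np.linspace(0.1, 1.0, 5))
    plt.plot(train_sizes, train_scores.mean(axis=1), "o-", label="Training score")
    plt.plot(train_sizes, test_scores.mean(axis=1), "o-", label="Cross-validation score")
    plt.legend(loc="best")
    return plt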
In [7]:
from classification_metrics import classification_metrics
# make predictions
predicted = best_estimator.predict(X_test)
classification_metrics(y_test, predicted, patient_groups)
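classification_metrics is likewise a small local helper. Given the metrics imported in the first cell, it most plausibly prints the overall accuracy, the confusion matrix, and a per-class report; a minimal sketch under that assumption:

# Plausible sketch of the local classification_metrics helper (the real module is not shown);
# accuracy_score, confusion_matrix, and classification_report are imported in the first cell.
def classification_metrics_sketch(y_true, y_pred, target_names):
    print("Accuracy: %.3f" % accuracy_score(y_true, y_pred))
    print("Confusion matrix:\n%s" % confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred, target_names=target_names))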
In [8]:
# Side-by-side comparison of predicted and actual patient groups, indexed by sample ID.
d = {"Predicted": [patient_groups[i] for i in predicted],
     "Actual": [patient_groups[i] for i in y_test]}
patient_df = pd.DataFrame(d, index=X_test.index)
patient_df.sort_values(by="Actual")
Out[8]:
In [9]:
# Misclassified samples only.
patient_df[patient_df["Predicted"] != patient_df["Actual"]]
Out[9]:
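For a per-class view of these errors, a pandas crosstab of actual versus predicted labels summarizes which groups are confused with each other (an optional addition, not part of the original notebook):

# Cross-tabulate actual vs. predicted patient groups; margins add row/column totals.
pd.crosstab(patient_df["Actual"], patient_df["Predicted"], margins=True)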