In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from math import pow
from sklearn.feature_selection import RFECV, RFE
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import LinearSVC, LinearSVR
In [2]:
patient_groups = ["control", "viral", "bacterial", "fungal"]
group_id = lambda name: patient_groups.index(name)
# Load the gene-expression (CPM) table; after the transpose, rows are
# samples and columns are genes.
# FIX: pd.DataFrame.from_csv is deprecated (removed in pandas 1.0);
# pd.read_csv with index_col=0 is the supported equivalent.
df = pd.read_csv("combineSV_WTcpmtable_v2+LeptoCase.txt", sep="\s+", index_col=0).T
num_genes = df.shape[1]
# Percentage of genes with a non-zero count for each sample.
df['coverage'] = df.astype(bool).sum(axis=1)/num_genes*100
# Assumes the source file orders samples by group: 30 bacterial, 42 viral,
# 61 control (133 rows total, including the Lepto case) — TODO confirm.
df['diagnosis'] = [group_id("bacterial")] * 30 \
+ [group_id("viral")] * 42 \
+ [group_id("control")] * 61
drop_diagnosis = lambda df: df.drop("diagnosis", axis=1)
# Hold out the single Leptospirosis sample (a bacterial infection) as the
# one-sample test set; everything else is the training set.
lepto_index = "Lepto_Case"
X_test = drop_diagnosis(df[df.index == lepto_index])
y_test = [group_id("bacterial")]
df.drop(lepto_index, inplace=True)
X_train = drop_diagnosis(df)
y_train = df['diagnosis']
# FIX: print() function form runs under both Python 2 and Python 3; the
# original bare print statement is Python-2-only.
print("Training set has %d samples. Testing set has %d samples." % (len(X_train), len(X_test)))
Training set has 132 samples. Testing set has 1 samples.
In [3]:
def print_gridsearch_results(clf):
    """Report a fitted grid search: the best score/params first, then one
    line per evaluated candidate with its mean CV score, std, and params."""
    print("Best: %f using %s" % (clf.best_score_, clf.best_params_))
    results = clf.cv_results_
    rows = zip(results['mean_test_score'],
               results['std_test_score'],
               results['params'])
    for mean_score, std_score, candidate in rows:
        print("%f (%f) with: %r" % (mean_score, std_score, candidate))
In [4]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
#parameters={'estimator__C': [pow(2, i) for i in xrange(-25, 4, 1)]}
# Heavily regularized logistic regression (C = 2^-23) as the base
# estimator whose coefficients drive feature elimination.
est = LogisticRegression(class_weight="balanced", C=pow(2, -23))
#est = LinearSVR(C=pow(2, -23))
#clf = RFE(estimator=lr, step=0.01, n_features_to_select=1, verbose=1)
# RFECV removes 0.1% of the remaining features per iteration and keeps the
# subset size with the best 4-fold CV score.
# NOTE(review): 'neg_mean_squared_error' scores the integer class labels
# as if they were ordinal quantities; for a classification target a metric
# such as 'accuracy' or 'f1_macro' may be more appropriate — confirm this
# scoring choice is intentional before relying on the selected subset.
clf = RFECV(cv=4, estimator=est, n_jobs=6, scoring='neg_mean_squared_error', step=.001, verbose=0)
#clf = GridSearchCV(rfe, parameters, scoring='accuracy', n_jobs=8, cv=3, verbose=1)
clf.fit(X_train, y_train)
#print_gridsearch_results(clf)
#print_gridsearch_results(clf)
Out[4]:
RFECV(cv=4,
estimator=LogisticRegression(C=1.19209289551e-07, class_weight='balanced', dual=False,
fit_intercept=True, intercept_scaling=1, max_iter=100,
multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
n_jobs=6, scoring='neg_mean_squared_error', step=0.001, verbose=0)
In [5]:
from IPython.core.display import display, HTML

# Genes with rank 1 are the ones RFECV kept in its final feature subset.
rfe_features = []
for (rank, name) in zip(clf.ranking_, X_test.columns):
    if rank == 1:
        rfe_features.append(name)

s = """
<h2>List of %d genes found by RFE</h2>
<p>Note: the NCBI link will open the target in a new window or tab.</p>
<table>
""" % (clf.n_features_)
ncbi_url = lambda gene: "https://www.ncbi.nlm.nih.gov/gene/?term=%s[Gene Name] AND Human[Organism]" % (gene)
# Lay the gene links out ten per table row.
s += "<tr>"
for (i, gene) in enumerate(rfe_features):
    # BUG FIX: only close/reopen the row at every tenth gene AFTER the
    # first; the original condition also fired at i == 0, emitting an
    # empty leading <tr></tr> row.
    if i and not i % 10:
        s += "</tr><tr>"
    s += """
<td><a target=\"_blank\" href=\"%s\">%s</a></td>
""" %(ncbi_url(gene), gene)
s += "</tr></table>"
display(HTML(s))
List of 8935 genes found by RFE
Note: the NCBI link will open the target in a new window or tab.
In [6]:
# clf is the fitted RFECV from the cell above; alias kept for readability.
best_estimator = clf
print("Optimal number of features : %d" % best_estimator.n_features_)
print("Recursive Feature Elimination (RFE) eliminated %d features" % (X_train.shape[1] - best_estimator.n_features_))
# Plot number of features VS. cross-validation scores
# grid_scores_ holds one CV score per feature-subset size RFECV evaluated,
# so the x-axis is the subset index, not a raw feature count.
plt.figure()
plt.xlabel("Number of feature subsets selected")
plt.ylabel("Cross validation score")
plt.plot(range(1, len(best_estimator.grid_scores_) + 1), best_estimator.grid_scores_)
plt.show()
Optimal number of features : 8935
Recursive Feature Elimination (RFE) eliminated 14789 features
In [7]:
%matplotlib inline
from learning_curves import plot_learning_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
def create_learning_curve(title, model):
cv = StratifiedShuffleSplit(n_splits=2, test_size=0.3, random_state=7)
plot_learning_curve(model, title, X_test, y_test, (0.2, 1.01), cv=cv, n_jobs=1)
create_learning_curve("Learning Curves Full Feature Set", est)
create_learning_curve("Learning Curves RFE Feature Set", clf.estimator_)
plt.show()
None
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-7-52b2b3ac6fcf> in <module>()
8 plot_learning_curve(model, title, X_test, y_test, (0.2, 1.01), cv=cv, n_jobs=1)
9
---> 10 create_learning_curve("Learning Curves Full Feature Set", est)
11 create_learning_curve("Learning Curves RFE Feature Set", clf.estimator_)
12
<ipython-input-7-52b2b3ac6fcf> in create_learning_curve(title, model)
6 def create_learning_curve(title, model):
7 cv = StratifiedShuffleSplit(n_splits=2, test_size=0.3, random_state=7)
----> 8 plot_learning_curve(model, title, X_test, y_test, (0.2, 1.01), cv=cv, n_jobs=1)
9
10 create_learning_curve("Learning Curves Full Feature Set", est)
/Users/matt/workspace/notebooks/learning_curves.pyc in plot_learning_curve(estimator, title, X, y, ylim, cv, n_jobs, train_sizes)
59 plt.ylabel("Score")
60 train_sizes, train_scores, test_scores = learning_curve(
---> 61 estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
62 train_scores_mean = np.mean(train_scores, axis=1)
63 train_scores_std = np.std(train_scores, axis=1)
/Users/matt/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.pyc in learning_curve(estimator, X, y, groups, train_sizes, cv, scoring, exploit_incremental_learning, n_jobs, pre_dispatch, verbose)
746 cv = check_cv(cv, y, classifier=is_classifier(estimator))
747 # Make a list since we will be iterating multiple times over the folds
--> 748 cv_iter = list(cv.split(X, y, groups))
749 scorer = check_scoring(estimator, scoring=scoring)
750
/Users/matt/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_split.pyc in split(self, X, y, groups)
951 """
952 X, y, groups = indexable(X, y, groups)
--> 953 for train, test in self._iter_indices(X, y, groups):
954 yield train, test
955
/Users/matt/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_split.pyc in _iter_indices(self, X, y, groups)
1257 class_counts = bincount(y_indices)
1258 if np.min(class_counts) < 2:
-> 1259 raise ValueError("The least populated class in y has only 1"
1260 " member, which is too few. The minimum"
1261 " number of groups for any class cannot"
ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
In [ ]:
from classification_metrics import classification_metrics

# Fit the full-feature baseline and compare it with the RFE-selected model
# on the held-out Lepto case.
est.fit(X_train, y_train)
est_predicted = est.predict(X_test)
# FIX: use the print() function form (as print_gridsearch_results already
# does) so the cell also runs under Python 3; the bare print statement is
# Python-2-only.
print("Full Metrics")
classification_metrics(y_test, est_predicted, patient_groups)
rfe_predicted = clf.predict(X_test)
print("-" * 80)
print("\nRFE Metrics")
classification_metrics(y_test, rfe_predicted, patient_groups)
In [ ]:
# Per-sample class probabilities from the RFE model, shown next to the
# predicted and actual group labels.
probs = pd.DataFrame(clf.predict_proba(X_test))

def probstrs(vals):
    """Format a sequence of probabilities as two-decimal percentage strings."""
    return ["%.2f" % (value * 100) for value in vals]

d = {"Predicted": [patient_groups[label] for label in rfe_predicted],
     "Actual": [patient_groups[label] for label in y_test],
     "Prob. control": probstrs(probs[0]),
     "Prob. viral": probstrs(probs[1]),
     "Prob. bacteria": probstrs(probs[2])}
patient_df = pd.DataFrame(d, index=X_test.index)
patient_df.sort_values(by="Actual")
In [ ]:
patient_df[patient_df["Predicted"] != patient_df["Actual"]]
Content source: massie/notebooks
Similar notebooks: