Can specific miRNAs, or a set of principal components derived from them, be used to distinguish classes, i.e. control vs. infected?
References:
Y.-h. Taguchi and Y. Murakami, "Principal component analysis based feature extraction approach to identify circulating microRNA biomarkers," PLoS One, vol. 8, no. 6, p. e66714, 2013.
In [1]:
import glob,os
import pandas as pd
import numpy as np
import mirnaseq.mirdeep2 as mdp
from mirnaseq import base, analysis
pd.set_option('display.width', 130)
%matplotlib inline
import pylab as plt
from mpl_toolkits.mplot3d import Axes3D
base.seabornsetup()
plt.rcParams['savefig.dpi']=80
plt.rcParams['font.size']=16
home = os.path.expanduser("~")
In [2]:
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.decomposition import PCA, RandomizedPCA
from sklearn.lda import LDA
In [46]:
#iconmap data
path = os.path.join(home,'mirnaseq/data/iconmap/analysis/')
os.chdir(path)
respath = 'iconmap_results_mirdeep'
ic = mdp.getResults(respath)
ic = mdp.filterExprResults(ic,meanreads=150,freq=0.8)
k = ic[ic.novel==False]
condmap = mdp.getLabelMap(respath,'samplelabels.csv')
condmap = condmap.sort('id')
labels=['control','infected']
y=condmap.status.values
#labels=[0,6]
#y=condmap.month.values
In [53]:
#douwe data
path = os.path.join(home,'mirnaseq/data/douwe/analysis/')
os.chdir(path)
respath = 'douwe_mirdeep_rnafiltered'
d = mdp.getResults(respath)
d = mdp.filterExprResults(d,meanreads=150,freq=0.8)
k = d[d.novel==False]
condmap = mdp.getLabelMap(respath,'samplelabels.csv')
condmap = condmap.sort('id')
#labels=['START','EARLY','LATE']
#y=condmap.timegroup.values
#labels=[1,2]
#y=condmap.pool.values
print condmap[:5]
In [54]:
#use normalised columns
scols = condmap.id+'(norm)'
df = k.set_index('#miRNA')
X=df[scols].T
X=X.dropna(axis=1)
X=X.drop_duplicates()
names = list(X.columns)
X=X.values
In [55]:
#standardize data
from sklearn import preprocessing
#scaler = preprocessing.StandardScaler().fit(X)
scaler = preprocessing.MinMaxScaler().fit(X)
X = scaler.transform(X)
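MinMax scaling maps each miRNA to the [0, 1] range. For count data a log2 transform before scaling is a common alternative, since it damps the influence of very highly expressed miRNAs; a variant sketch (not used in the rest of the notebook):
In [ ]:
#variant: log2-transform the normalised counts before scaling
#illustrative only; the analysis below uses the MinMax-scaled X above
Xv = df[scols].T.dropna(axis=1).drop_duplicates().values
Xlog = preprocessing.MinMaxScaler().fit_transform(np.log2(Xv + 1))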
In [7]:
def plot2D(X, x1=0, x2=1, ax=None):
    """Scatter two columns of X, coloured by class label"""
    if ax is None:
        f = plt.figure(figsize=(6,6))
        ax = f.add_subplot(111)
    for c, i in zip("brgyc", labels):
        ax.scatter(X[y == i, x1], X[y == i, x2], c=c, s=100, label=i, alpha=0.8)
    ax.legend()
    return
In [8]:
def plot3D(X):
    fig = plt.figure(figsize=(8,6))
    ax = Axes3D(fig, rect=[0, 0, .95, 1])
    for c, i in zip("brgyc", labels):
        ax.scatter(X[y == i, 0], X[y == i, 1], X[y == i, 2], c=c, s=100, label=i)
    plt.legend()
    for label in labels:
        ax.text3D(X[y == label, 0].mean(),
                  X[y == label, 1].mean() + 1.5,
                  X[y == label, 2].mean(), label,
                  horizontalalignment='center',
                  bbox=dict(alpha=.5, edgecolor='w', facecolor='w'))
In [9]:
def doPCA(X):
    #fit PCA and project the samples onto the first components
    pca = PCA(n_components=5)
    Xt = pca.fit_transform(X)
    print pca.explained_variance_ratio_
    f, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,5))
    plot2D(X, ax=ax1)
    plot2D(Xt, ax=ax2)
    ax2.set_xlabel('PC1')
    ax2.set_ylabel('PC2')
    plt.tight_layout()
    plot3D(Xt)
    return Xt
In [10]:
def plotClassification(cl, X, ax=None):
    """Plot classifier decision regions over the first two features"""
    if ax is None:
        f = plt.figure(figsize=(6,6))
        ax = f.add_subplot(111)
    X = X[:, :2]
    cl.fit(X, y)
    h = .04   #mesh step size
    e = 0.1   #margin around the data
    x_min, x_max = X[:, 0].min() - e, X[:, 0].max() + e
    y_min, y_max = X[:, 1].min() - e, X[:, 1].max() + e
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = cl.predict(np.c_[xx.ravel(), yy.ravel()])
    #encode predicted class labels as integers for the colormap
    Z = pd.Categorical(Z).labels
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    ax.pcolormesh(xx, yy, Z, cmap=plt.cm.coolwarm)
    for c, i in zip("brgyc", labels):
        ax.scatter(X[y == i, 0], X[y == i, 1], c=c, s=100, label=i, lw=2)
    ax.legend()
    return ax
In [11]:
def classify(X, y, cl, name=''):
    """Fit a classifier on a random train/test split and report performance"""
    from sklearn.cross_validation import train_test_split
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.4)
    cl.fit(Xtrain, ytrain)
    ypred = cl.predict(Xtest)
    print classification_report(ytest, ypred)
    #cross-validated AUC using integer-encoded class labels
    from sklearn import cross_validation
    yl = pd.Categorical(y).labels
    sc = cross_validation.cross_val_score(cl, X, yl, scoring='roc_auc', cv=5)
    print("AUC: %0.2f (+/- %0.2f)" % (sc.mean(), sc.std() * 2))
    ax = plotClassification(cl, X)
    ax.set_title(name)
    return cl
Do a principal component analysis on the normalised data and plot the first two PCs by category
In [64]:
labels = ['CTRL','MAP']
y = condmap.infection.values
Xt = doPCA(X)
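To see which specific miRNAs drive the separation in the first two components, we can refit a PCA and rank the absolute loadings (a short sketch; the refit is needed because doPCA does not return the fitted object):
In [ ]:
#rank miRNAs by their absolute loading on PC1 and PC2
pca = PCA(n_components=5)
pca.fit(X)
for pc in range(2):
    load = sorted(zip(np.abs(pca.components_[pc]), names), reverse=True)
    print 'PC%d top loadings:' %(pc+1)
    for w, n in load[:10]:
        print '%.3f %s' %(w, n)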
Create a classifier using the miRNA abundances as features
In [21]:
knn = KNeighborsClassifier()
logit = linear_model.LogisticRegression(C=1e5)
svm = SVC(kernel='linear')
classify(X, y, knn, 'knn')
Out[21]:
In [14]:
cl = classify(X, y, logit, 'logistic regression')
In [15]:
cl2 = classify(Xt, y, knn, 'knn pca features')
In [16]:
labels = ['Neg','Pos']
y = condmap.ELISA.values
Xt = doPCA(X)
In [17]:
cl = classify(X, y, knn, 'knn')
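The AUC that classify prints summarises ranking performance across folds; to visualise it on a single held-out split, one could plot a ROC curve (a minimal sketch, reusing the knn object and the same pd.Categorical encoding as classify):
In [ ]:
#ROC curve for knn on one held-out split
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_curve, auc
yl = pd.Categorical(y).labels
Xtr, Xts, ytr, yts = train_test_split(X, yl, test_size=0.4)
knn.fit(Xtr, ytr)
probs = knn.predict_proba(Xts)[:,1]
fpr, tpr, t = roc_curve(yts, probs)
plt.plot(fpr, tpr, label='AUC=%.2f' %auc(fpr, tpr))
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.legend(loc='lower right')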
Find the most discriminating miRNAs using feature selection on the original column data (no PCA)
In [65]:
yl = pd.Categorical(y).labels
#clf = KNeighborsClassifier()
clf = linear_model.LogisticRegression(penalty='l1')
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, SelectPercentile
#univariate selection of the 10 highest-scoring features
s = SelectKBest(k=10)
#s = SelectPercentile(percentile=60)
s.fit_transform(X, y)
'''res = sorted(zip(map(lambda x: round(x, 4), s.scores_), names), reverse=True)
for i in res[:20]:
    print i'''
#recursive feature elimination: rank features with the logistic model
fs = RFE(clf, 10)
fs = fs.fit(X, y)
res = sorted(zip(map(lambda x: round(x, 4), fs.ranking_), names))
for i in res[:20]:
    print i
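Both selectors expose a support mask, so the chosen features can be mapped straight back to miRNA names (a small sketch using get_support and RFE.support_):
In [ ]:
#miRNAs kept by each selector
print [names[i] for i in s.get_support(indices=True)]
print [names[i] for i in np.where(fs.support_)[0]]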
In [68]:
#use a pipeline to find the best number of principal components as features
labels = ['CTRL','MAP']
y = condmap.infection.values
pca = RandomizedPCA()
pca.fit(X)
from sklearn.pipeline import Pipeline
pipe = Pipeline(steps=[('pca', pca), ('knn', knn)])
#grid search over the number of components, scored by 3-fold CV
components = range(3, 30, 2)
params = dict(pca__n_components=components)
est = GridSearchCV(pipe, params, cv=3)
est.fit(X, y)
print est.best_estimator_.named_steps['pca'].n_components
plt.figure(figsize=(6,3))
plt.plot(pca.explained_variance_, linewidth=2)
plt.axvline(est.best_estimator_.named_steps['pca'].n_components,
            linestyle=':', label='n_components chosen')
plt.legend()
Out[68]:
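The explained-variance curve shows how much signal each component carries, but the grid search selects on cross-validated accuracy; plotting the mean CV score per n_components makes the chosen value easier to judge (a sketch assuming this sklearn version's grid_scores_ attribute):
In [ ]:
#mean cross-validated score for each number of components tried
scores = [g.mean_validation_score for g in est.grid_scores_]
plt.figure(figsize=(6,3))
plt.plot(components, scores, '-o', linewidth=2)
plt.xlabel('n_components')
plt.ylabel('mean CV score')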
In [66]:
def doLDA(X, y):
    #with two classes LDA yields a single discriminant axis
    lda = LDA(n_components=2)
    Xl = lda.fit_transform(X, y)
    fig = plt.figure(figsize=(6,6))
    for c, i in zip("grb", labels):
        #plot each sample's position on the first discriminant axis
        plt.plot(Xl[y == i, 0], 'o', c=c, label=i)
    plt.legend()
In [67]:
labels = ['Neg','Pos']
y = condmap.ELISA.values
doLDA(X,y)
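LDA is used above purely as a projection, but it is also a classifier; cross-validating it gives a quick check that the apparent separation holds up out of sample (a short sketch on the same ELISA labels):
In [ ]:
#cross-validated accuracy of LDA as a classifier
from sklearn import cross_validation
yl = pd.Categorical(y).labels
sc = cross_validation.cross_val_score(LDA(), X, yl, cv=5)
print "LDA accuracy: %0.2f (+/- %0.2f)" %(sc.mean(), sc.std()*2)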