The data we use in this example are publicly available microarray data (GEO accession GSE18965).
The basic idea is to summarize a data set with a large number of features in a lower-dimensional space, and then use that lower-dimensional projection to determine, in a systematic way, whether any samples are outliers.
The samples are atopic-asthmatic (AA) subjects and healthy non-asthmatic (HN) controls.
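As a minimal sketch of this two-step idea on synthetic data (the array and the centroid-distance rule here are illustrative only, not part of the GSE18965 analysis below): project with PCA, then rank samples by their distance from the projection's centroid.

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.normal(size=(30, 500))            # 30 samples, 500 features
X[-1] += 4.0                              # shift the last sample so it is an outlier

proj = PCA(n_components=2).fit_transform(X)
dist = np.sqrt(((proj - proj.mean(axis=0)) ** 2).sum(axis=1))
print(np.argsort(dist)[::-1][:3])         # samples farthest from the centroid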
In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.covariance import EllipticEnvelope
In [2]:
mat = np.loadtxt("gse18965.txt")
covs = pd.read_csv("gse18965-targets.csv", usecols=[0,1,2,4,5])
print(covs)

## append two synthetic outlier samples to the covariates
## (DataFrame.append was removed in pandas 2.0, so use pd.concat)
covsAdded = pd.DataFrame([["X","OT",999,11.1,"M"], ["Y","OT",998,12.5,"F"]],
                         columns=['sample','phenotype','subject','age','gender'])
covs = pd.concat([covs, covsAdded], ignore_index=True)
print(covs)

print(mat.shape)
print(mat.mean(axis=0).shape, mat.mean(axis=0).mean())

## append the matching synthetic outlier expression profiles
mat = np.vstack([mat, mat.mean(axis=0)*1.5])
mat = np.vstack([mat, mat.mean(axis=0)*1.6])
print(mat.shape)
In [3]:
from IPython.display import Image
matScaled = preprocessing.scale(mat)
fit1 = PCA(n_components=2).fit_transform(mat)
fit2 = TSNE(learning_rate=100,perplexity=10,n_iter=2000).fit_transform(mat)
fit3 = PCA(n_components=2).fit_transform(matScaled)
fit4 = TSNE(learning_rate=100,perplexity=10,n_iter=2000).fit_transform(matScaled)
def make_subplot(fit, covs, covariate, ax, pcX=0, pcY=1, fontSize=10,
                 fontName='sans serif', ms=20, leg=True, title=None):
    ## scatter the samples, one color per covariate level
    colors = ['k','cyan','r','orange','g','b','magenta']
    cvNames = np.sort(np.unique(covs[covariate]))
    lines = []
    for _i, i in enumerate(cvNames):
        indices = np.where(covs[covariate]==i)[0]
        s = ax.scatter(fit[indices,pcX], fit[indices,pcY], c=colors[_i],
                       s=ms, label=covariate, alpha=0.9)
        lines.append(s)

    ## tick labels (tick_params replaces the deprecated tick.label attribute)
    ax.tick_params(axis='both', labelsize=fontSize-2)

    ## pad the axis limits by 2% of the data range
    buff = 0.02
    bufferX = buff * (fit[:,pcX].max() - fit[:,pcX].min())
    bufferY = buff * (fit[:,pcY].max() - fit[:,pcY].min())
    ax.set_xlim([fit[:,pcX].min()-bufferX, fit[:,pcX].max()+bufferX])
    ax.set_ylim([fit[:,pcY].min()-bufferY, fit[:,pcY].max()+bufferY])
    ax.set_xlabel("D-%s"%str(pcX+1), fontsize=fontSize, fontname=fontName)
    ax.set_ylabel("D-%s"%str(pcY+1), fontsize=fontSize, fontname=fontName)
    ax.locator_params(axis='x', nbins=5)
    ax.set_aspect(1./ax.get_data_ratio())

    if title:
        ax.set_title(title, fontsize=fontSize+2, fontname=fontName)
    if leg:
        legend = ax.legend(lines, cvNames, loc='upper right', scatterpoints=1,
                           handletextpad=0.01, labelspacing=0.01,
                           borderpad=0.1, handlelength=1.0)
        for label in legend.get_texts():
            label.set_fontsize(fontSize-2)
            label.set_fontname(fontName)
plt.clf()
fig = plt.figure()
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
ax4 = fig.add_subplot(224)
make_subplot(fit1,covs,'phenotype',ax1,pcX=0,pcY=1,leg=True,title='PCA-raw')
make_subplot(fit2,covs,'phenotype',ax2,pcX=0,pcY=1,leg=False,title='tSNE-raw')
make_subplot(fit3,covs,'phenotype',ax3,pcX=0,pcY=1,leg=False,title='PCA-scaled')
make_subplot(fit4,covs,'phenotype',ax4,pcX=0,pcY=1,leg=False,title='tSNE-scaled')
ax2.set_xlabel("")
ax4.set_xlabel("")
plt.subplots_adjust(hspace=0.3,wspace=0.05)
plt.savefig("outliers-projection.png",dpi=600)
In [4]:
Image(filename='outliers-projection.png')
Out[4]:
Because we are not primarily interested in relative fold-change differences, i.e. we do not want the scale of expression differences to dominate the projection, we standardize the data. Based on these plots it appears that PCA does a better job than t-SNE of separating the outliers. Depending on how far out we place the outliers, PCA often does better on scaled data, while t-SNE tends to perform better on the original data.
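As a reminder of what the standardization does, preprocessing.scale centers each feature (column) to zero mean and scales it to unit variance; a quick check on a toy array (illustrative only):

import numpy as np
from sklearn import preprocessing

A = np.array([[1., 10.], [2., 20.], [3., 30.]])
As = preprocessing.scale(A)
print(As.mean(axis=0))   # approximately [0. 0.]
print(As.std(axis=0))    # approximately [1. 1.]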
The outlier-detection code below is adapted from the scikit-learn outlier detection example.
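For orientation, here is a minimal sketch of the EllipticEnvelope API on toy data (illustrative only): fit learns a robust covariance estimate of the inlier distribution, and predict labels inliers +1 and outliers -1.

import numpy as np
from sklearn.covariance import EllipticEnvelope

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(size=(40, 2)), [[6., 6.]]])   # one obvious outlier
ee = EllipticEnvelope(contamination=0.1).fit(X)
print(ee.predict(X))   # +1 for inliers, -1 for outliers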
In [5]:
from IPython.display import Image
outliers_fraction = 0.15
classifiers = {"robust covariance estimator": EllipticEnvelope(contamination=outliers_fraction)}
def make_subplot_again(X, covs, covariate, ax, pcX=0, pcY=1, fontSize=10,
                       fontName='sans serif', ms=20, leg=True, title=None):
    ## variables
    colors = ['k','cyan','r','orange','g','b','magenta']
    clf = classifiers["robust covariance estimator"]
    X = preprocessing.scale(X.copy())

    ## figure out the plotting grid from the data range (2% padding)
    buff = 0.02
    bufferX = buff * (X[:,pcX].max() - X[:,pcX].min())
    bufferY = buff * (X[:,pcY].max() - X[:,pcY].min())
    mm = [(X[:,pcX].min()-bufferX, X[:,pcX].max()+bufferX),
          (X[:,pcY].min()-bufferY, X[:,pcY].max()+bufferY)]
    xx, yy = np.meshgrid(np.linspace(mm[0][0], mm[0][1], 500),
                         np.linspace(mm[1][0], mm[1][1], 500))

    ## fit the data and tag outliers: samples whose decision scores fall
    ## below the outliers_fraction percentile are flagged (False below)
    clf.fit(X)
    y_pred = clf.decision_function(X).ravel()
    threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
    y_pred = y_pred > threshold
    print(y_pred)

    ## plot the decision-function level lines and the points
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), cmap=plt.cm.Blues_r)
    ax.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
    ax.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
    ax.axis('tight')

    cvNames = np.sort(np.unique(covs[covariate]))
    lines = []
    for _i, i in enumerate(cvNames):
        indices = np.where(covs[covariate]==i)[0]
        s = ax.scatter(X[indices,pcX], X[indices,pcY], c=colors[_i],
                       s=ms, label=covariate, alpha=0.9)
        lines.append(s)

    ## axes
    ax.tick_params(axis='both', labelsize=fontSize-2)
    ax.set_xlabel("D-%s"%str(pcX+1), fontsize=fontSize, fontname=fontName)
    ax.set_ylabel("D-%s"%str(pcY+1), fontsize=fontSize, fontname=fontName)
    ax.locator_params(axis='x', nbins=5)
    ax.set_aspect(1./ax.get_data_ratio())
    ax.set_xlim(mm[0])
    ax.set_ylim(mm[1])

    if title:
        ax.set_title(title, fontsize=fontSize+2, fontname=fontName)
    if leg:
        legend = ax.legend(lines, cvNames, loc='upper right', scatterpoints=1,
                           handletextpad=0.01, labelspacing=0.01,
                           borderpad=0.1, handlelength=1.0)
        for label in legend.get_texts():
            label.set_fontsize(fontSize-2)
            label.set_fontname(fontName)
## make the figure again
plt.clf()
fig = plt.figure()
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
ax4 = fig.add_subplot(224)
make_subplot_again(fit1,covs,'phenotype',ax1,pcX=0,pcY=1,leg=True,title='PCA-raw')
make_subplot_again(fit2,covs,'phenotype',ax2,pcX=0,pcY=1,leg=False,title='tSNE-raw')
make_subplot_again(fit3,covs,'phenotype',ax3,pcX=0,pcY=1,leg=False,title='PCA-scaled')
make_subplot_again(fit4,covs,'phenotype',ax4,pcX=0,pcY=1,leg=False,title='tSNE-scaled')
ax1.set_xlabel("")
ax2.set_xlabel("")
ax2.set_ylabel("")
ax4.set_ylabel("")
plt.subplots_adjust(hspace=0.3,wspace=0.05)
plt.savefig("outliers-detection.png",dpi=600)
In [6]:
Image(filename='outliers-detection.png')
Out[6]: