In [1]:
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
# plotly
import plotly
plotly.offline.init_notebook_mode() # run at the start of every ipython notebook
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=False)
# use seaborn plotting style defaults
doInteractive = True
def interactivePlots(fig, axes):
# helper function to decide to use plotly interactive plots or not
if(doInteractive):
plotly.offline.iplot_mpl(fig, show_link=False, strip_style=True) # offline ipython notebook
In [2]:
# Iris data
from sklearn import datasets
data = datasets.load_iris()
labels = np.array(['0']*50 + ['1']*50 + ['2']*50)
d = pd.DataFrame(data.data, columns = ['1','2','3','4'], index = labels)
In [3]:
print('data shape\n', d.shape)
d.head()
Out[3]:
In [4]:
def do_pca(X, n_components=10, centered=True):
'''
Return:
f: ratio of explained variance for each PC
Wt: PCA loadings for each features
Y: projected data in PCA space
'''
d = X
n_features = X.shape[1]
if n_components > n_features:
n_components = n_features
if isinstance(X, pd.DataFrame):
X = X.values
if centered:
X = X.astype('float') # Since X is object
X = X - X.mean(0)
X = X/X.std(0)
pca = PCA(n_components=n_components)
pca.fit(X)
fracs = pca.explained_variance_ratio_
F = [float("{0:.4f}".format(i)) for i in fracs]
Y = pca.fit_transform(X) # n_sample, n_redim
Wt = pca.components_ # n_redim, n_feature
# construct three DataFrames for later use:
# PCA loadings; % of variance explained for each PC; projected data on PCs
Wt_df = pd.DataFrame(Wt, index=['PC' + str(i) for i in np.arange(1, Wt.shape[0]+1)], columns=d.columns.values)
F_df = pd.DataFrame([str(f*100) + '%' for f in F], index=Wt_df.index, columns=['var. %'])
Data_proj = pd.DataFrame(Y, index=d.index, columns=['PC'+str(i) for i in np.arange(1, Y.shape[1]+1)])
return F_df, Wt_df, Data_proj, fracs
F, Wt, X_proj, fracs = do_pca(d)
X_proj = X_proj/abs(X_proj).max().max()
In [5]:
fig=plt.figure(figsize=(5, 5))
plt.plot(np.cumsum(fracs))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
interactivePlots(fig, None)
In [6]:
PC1 = 0
PC2 = 1
# Create a trace
traceCelltype = list()
labelList = np.unique(labels)
assert set(np.unique(labels)) == set(labelList), 'Missing labels'
for lbl in labelList:
traceCelltype.append(go.Scatter(
x = X_proj[labels==lbl].iloc[:,PC1],
y = X_proj[labels==lbl].iloc[:,PC2],
mode='markers',
name = lbl,
text = lbl
))
iplot(traceCelltype, filename='GuoPCA')
In [7]:
# 3D plot
PC1 = 0
PC2 = 1
PC3 = 2
traceCelltype3d = list()
layout = go.Layout(showlegend=True, title='Scatter plot of 3-D PCA space', annotations=list())
for lbl in labelList:
traceCelltype3d.append(go.Scatter3d(
x = X_proj[labels==lbl].iloc[:,PC1],
y = X_proj[labels==lbl].iloc[:,PC2],
z = X_proj[labels==lbl].iloc[:,PC3],
mode='markers',
name = lbl,
text = lbl
))
fig = go.Figure(data=traceCelltype3d, layout=layout)
iplot(fig, filename='GuoPCA3D')
In [8]:
# Calculate vector magnitudes for each gene
GenePCMagnitudes = Wt.loc['PC1',:]**2+Wt.loc['PC2',:]
GenePCMagnitudes.sort_values(inplace=True, ascending=False)
# plot principal components
def GetPCVectorsForEachGene(fLine=False, topN=None):
tracePCs = list()
layout = go.Layout(showlegend=True, title='Top principal components', annotations=list())
for i, g in enumerate(GenePCMagnitudes.iteritems()): # plot feature by feature
if(topN is not None and i >= topN):
continue
if(fLine):
xp = [0, Wt.loc['PC1', g[0]]* 2.5] # x1.5 to make it more visible
yp = [0, Wt.loc['PC2', g[0]]* 2.5]
mode='lines'
layout.annotations.append(
dict(
x=xp[1],
y=yp[1],
xref='x',
yref='y',
text=g[0]+'_PC',
showarrow=True,
arrowhead=7,
ax=0,
ay=-15
)
)
else:
xp = [Wt.loc['PC1', g[0]]* 2.5] # x1.5 to make it more visible
yp = [Wt.loc['PC2', g[0]]* 2.5]
mode='markers+text'
tracePCs.append(go.Scatter(
x = xp,
y = yp,
mode=mode,
name = g[0]+'_PC',
text = g[0]+'_PC',
marker={'size': 10},
line = {'width': 10},
textposition='top'
))
return tracePCs, layout
t, l = GetPCVectorsForEachGene(fLine=True, topN=4)
fig = go.Figure(data=t, layout=l)
iplot(fig, filename='GuoPCs')
In [9]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets
def plotinteractPCA(gene='Atp12a', showPCs=1, linePCs=False, PC1=0, PC2=1):
layout = go.Layout(showlegend=True, title='Interactive PCA plot',
annotations=list(), legend={'orientation': 'h'})
traceGene = list()
if(gene == 'celltype'):
for lbl in labelList:
traceGene.append(go.Scatter(
x = X_proj[labels==lbl].iloc[:,PC1],
y = X_proj[labels==lbl].iloc[:,PC2],
mode='markers',
name = lbl,
text = lbl
))
else:
traceGene.append(go.Scatter(
x = X_proj.iloc[:,PC1],
y = X_proj.iloc[:,PC2],
mode='markers',
marker = {'color': d[gene].values, 'size': 10, 'colorscale': 'Viridis',
'colorbar':{'title': 'expression'}},
text = labels,
name = 'Gene Expression'
))
if(showPCs > 0):
t, l = GetPCVectorsForEachGene(topN=showPCs, fLine=linePCs)
layout.annotations += l.annotations
traceGene += t
fig = go.Figure(data=traceGene, layout=layout)
iplot(fig, filename='GuoPCAInteractive')
_=interact(plotinteractPCA, gene=['celltype']+list(d.columns.values),
showPCs=(0, d.shape[1]), linePCs=True, PC1=(0, 3), PC2=(1, 3))
In [ ]: