In [23]:
# Import required modules
# Python plotting library
import matplotlib.pyplot as plt
# Numerical python library
import numpy as np
# Dataframes in Python
import pandas as pd
# Statistical plotting library we'll use
import seaborn as sns
# Import packages for dimension reduction
from sklearn.decomposition import PCA, FastICA, NMF
from sklearn.manifold import TSNE, MDS
# This is necessary to show the plotted figures inside the notebook -- "inline" with the notebook cells
%matplotlib inline
# Import interactive modules
from ipywidgets import interact,interactive,fixed
from IPython.display import display
# Import stats for Fisher exact p-value calculation
import scipy.stats as stats
In [10]:
# Read in the DGE data sheet
inFile = 'Test_filtered.mtx'
subset = pd.read_table(inFile, sep=',',
# Sets the first (Python starts counting from 0 not 1) column as the row names
index_col=0)
# Tells pandas to decompress the gzipped file if required
# ,compression='gzip')
# Print out the shape and the top 5 rows of the newly assigned dataframe
print(subset.shape)
subset.head()
Out[10]:
In [11]:
# Estimate percentage of variance can be explained by principal components:
sns.set()
pca = PCA().fit(subset)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
Out[11]:
In [12]:
# Combine dimension reduction and tSNE.
# Allow user to adjust the number of principal components and the perplexity of tSNE.
# Plot interactively.
## Define corresponding func for each of the dimension reduction approaches.
def PCAsm(data, n, max_it):
name = "PCA"
pcaer = PCA(n_components=n)
pcad = pcaer.fit_transform(data)
df = pd.DataFrame(pcad, index=data.index)
print("Shape of dimension reduced matrix is:", pcad.shape)
return df,name
def FastICAsm(data, n, max_it):
name = "ICA"
icaer = FastICA(n_components=n, max_iter=max_it)
icad = icaer.fit_transform(data)
print("Shape of dimension reduced matrix is:", icad.shape)
df = pd.DataFrame(icad, index=data.index)
return df,name
def NMFsm(data, n, max_it):
name = "NMF"
nmfer = NMF(n_components=n,max_iter=max_it)
nmfd = nmfer.fit_transform(data)
print("Shape of dimension reduced matrix is:", nmfd.shape)
df = pd.DataFrame(nmfd, index=data.index)
return df,name
## Define the main func to take in all adjustable variables
## Smush, save the new dataframe in df, and plot the tSNE plot.
def f(data, sm, n = 10,perp = 30, max_it = 600, it = 1000):
"""
Reduce the dimension of original dataset via PCA, ICA or NMF as chosen by user.
The output dataframe will be used as input of tSNE.
Upon activation by interact(), user can choose their desired sm, n, perp, max_it, and it.
In current version, tSNE is using 'Manhattan' as the distance matrix and working at learning rate of 1000.
Args:
sm: str. Name of smusher. Choose from PCA, ICA and NMF
n: int. Initial number of dimensions. The number of dimensions of smusher processed matrix before tSNE
perp: int. Perplexity level of tSNE
max_it: int. Maximum number of iterations in ICA or NMF
it: int. Maximum number of iterations in tSNE
"""
df, name = sm(data, n, max_it)
print("Perplexity level:", perp)
tsner = TSNE(perplexity=perp,n_iter=it,metric='manhattan')
tsned = tsner.fit_transform(df)
plt.figure(figsize=(3,3))
plt.scatter(tsned[:,0],tsned[:,1], 10)
plt.xlabel("tSNE1",fontsize=16)
plt.ylabel("tSNE2",fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
title = name + ":" + str(n) + " initial dimensions, " + "perplexity level at " + str(perp) + ", " + str(it) + " iterations"
plt.title(title)
In [6]:
# Set an interactive tSNE plot canvas
interact(f, data = fixed(subset), n=(10,100,10), perp = (5,50), it = (500,5000,500), max_it = (200,1000,200),sm={"PCA":PCAsm, "ICA":FastICAsm, "NMF":NMFsm})
Out[6]:
In [13]:
# Run ICA and recover the transformed matrix and key components
icaer = FastICA(n_components=6, max_iter=600)
icad = icaer.fit_transform(subset)
print("Shape of transformed ICA matrix is: ", icad.shape)
In [14]:
# Get component composition
icaf = icaer.fit(subset)
components = icaf.components_
In [15]:
# For each component, find the key contributors.
## Calculate absolute values of the faction factor
## Set an empty dictionary with component index as key and the feature list as value
ICA_comp_dict = {}
for i in range(6):
# Get composition of one component
array = pd.DataFrame(data = components).loc[i]
# Turn negative value to positive
abs_array = [abs(x) for x in array]
# Set filtering threshold to 90% quantile
thres = np.percentile(abs_array, 90)
# Make the boolean mask
mask = abs_array > thres
# Obtain genes with faction over the threshold
comp_gene_list = subset.columns[mask]
print(comp_gene_list.shape)
gene_list_upper = [x.upper() for x in comp_gene_list]
# Save feature list into dictionary
key = "Component" + str(i+1)
ICA_comp_dict[key] = gene_list_upper
#(pd.DataFrame(data = components).loc[0] > 0.0001).sum()
In [18]:
# Capitalize all genes in subset
subset.columns = [x.upper() for x in subset.columns]
subset.columns
Out[18]:
In [44]:
sns.distplot(pd.DataFrame(data = components).loc[5])
Out[44]:
In [75]:
# set.intersection(<sets>) -> find the overlapping terms in the genes
In [3]:
# Gene ontology enrichment
## Original proportion of cell cycle genes in the complete dataset
## Read in zebrafish cell cycle genes
ccycle_gene_mtx = pd.read_csv("Test_cellcycle_info_merged.txt",index_col=0)
ccycle_gene_mtx.head()
Out[3]:
In [8]:
## Extract only gene name and stage info from this dataframe
ccycle_genes = ccycle_gene_mtx[['Upper name.1','Stages']]
ccycle_gene = ccycle_genes.drop_duplicates()
ccycle_gene.columns = ['Zebrafish gene name', 'Stages']
In [22]:
## Calculate backround frequency
stage_bg = {}
for stage in ["G1/S","S","G2/M","M","M/G1"]:
stage_df = ccycle_gene[ccycle_gene['Stages'] == stage]
print("Shape of stage {}-specific sub-matrix is: {}".format(stage,stage_df.shape))
i = 0
for gene in stage_df['Zebrafish gene name']:
if gene in subset.columns:
i += 1
#ratio = i / len(subset.columns)
stage_bg[stage] = i
print("There are {} genes of stage {} in the subset background.".format(i,stage))
stage_bg
Out[22]:
In [20]:
## Calculate number of genes are overlapping with cell cycle genes for each component
stage_spec_gene_count_dict = {}
## Loop through list of component
for comp in sorted(ICA_comp_dict.keys()):
# Set count list default to zero
comp_ccycle_count_list = [0,0,0,0,0]
# Loop through the stages
for i,stage in zip(range(5),["G1/S","S","G2/M","M","M/G1"]):
stage_df = ccycle_gene[ccycle_gene['Stages'] == stage]
# Loop through genes for one component
for gene in ICA_comp_dict[comp]:
# Count number of cell cycle related genes in this component
if gene in stage_df['Zebrafish gene name']:
comp_ccycle_count_list[i] += 1
# Save gene count to dictionary
stage_spec_gene_count_dict[comp] = comp_ccycle_count_list
stage_spec_gene_count_dict
Out[20]:
In [25]:
len(subset.columns)
Out[25]:
In [33]:
# Calculate the ratio of cell cycle genes in each component
# Set two empty dicts, one for oddsratio of enrichment for each components, and the other for p-value
ICA_comp_odds_dict = {}
ICA_comp_pval_dict = {}
# Loop through list of components
for comp in sorted(stage_spec_gene_count_dict.keys()):
odds_list = []
pval_list = []
# Loop through index and their corresponding stages
for i,stage in zip(range(5),["G1/S","S","G2/M","M","M/G1"]):
# Save number of cell cycle gene in background, and that in specific component in variable.
ccbg = int(stage_bg[stage])
cccomp = int(stage_spec_gene_count_dict[comp][i])
# Use scipy stats to calculate fisher's exact p-value
odds, pval = stats.fisher_exact([[ccbg, cccomp],[3185-ccbg, 319-cccomp]])
# Append oddsratio and pvalue for each stage to a list
odds_list.append(odds)
pval_list.append(pval)
# Save the oddsratio/pvalue list as value for each component (key)
ICA_comp_odds_dict[comp] = odds_list
ICA_comp_pval_dict[comp] = pval_list
pd.DataFrame.from_dict(ICA_comp_odds_dict, orient='index')
Out[33]:
In [73]:
mdsed2 = smusher2.fit_transform(subset)
print(mdsed2.shape)
plt.figure(figsize=(4,4))
plt.scatter(mdsed2[:,0],mdsed2[:,1], 10)
plt.xlabel("Dimension1",fontsize=16)
plt.ylabel("Dimension2",fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
Out[73]:
In [7]:
smusher2 = MDS()
mdsed = smusher2.fit_transform(pcad_df)
print(mdsed.shape)
In [ ]:
plt.figure(figsize=(8,8))
plt.scatter(mdsed[:,0],mdsed[:,1], 10)
plt.xlabel("Dimension1",fontsize=16)
plt.ylabel("Dimension2",fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)