Dimension Reduction and Visualization

Ways to reduce the dimension:
1. Principal Component Analysis
2. Independent Component Analysis
3. NMF
Visualization post dimensional reduction:
1. tSNE
2. MDS



In [23]:

    
# Import required modules
# Python plotting library
import matplotlib.pyplot as plt
# Numerical python library
import numpy as np
# Dataframes in Python
import pandas as pd
# Statistical plotting library we'll use
import seaborn as sns
# Import packages for dimension reduction
from sklearn.decomposition import PCA, FastICA, NMF
from sklearn.manifold import TSNE, MDS
# This is necessary to show the plotted figures inside the notebook -- "inline" with the notebook cells
%matplotlib inline
# Import interactive modules
from ipywidgets import interact,interactive,fixed
from IPython.display import display
# Import stats for Fisher exact p-value calculation
import scipy.stats as stats



In [10]:

    
# Read in the DGE data sheet
inFile = 'Test_filtered.mtx'
subset = pd.read_table(inFile, sep=',',
                               
                                     # Sets the first (Python starts counting from 0 not 1) column as the row names
                                     index_col=0)

                                     # Tells pandas to decompress the gzipped file if required
                                     # ,compression='gzip')
# Print out the shape and the top 5 rows of the newly assigned dataframe
print(subset.shape)
subset.head()









    



(135, 3185)






    Out[10]:







  
    
      
      ACBD3
      AL928854.1
      AL954715.2
      ARHGEF12
      BICD2
      BX004816.3
      BX284638.1
      BX321875.1
      BX649250.1
      BX664625.3
      ...
      znf654
      znf706
      znf750
      znf865
      znfx1
      zranb2
      zswim5
      zswim8
      zw10
      zyg11
    
  
  
    
      ATCTTGGCGGCG
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      1
      0
      4
      0
    
    
      ATAGAGTCGCAT
      1
      0
      0
      0
      0
      4
      0
      1
      0
      0
      ...
      0
      0
      0
      0
      2
      1
      2
      1
      0
      0
    
    
      GCGCACGTTGAG
      0
      1
      0
      0
      1
      0
      0
      0
      0
      0
      ...
      0
      1
      1
      0
      0
      0
      1
      0
      0
      0
    
    
      TTCACGGCTGAG
      0
      0
      1
      0
      0
      0
      2
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
    
    
      GCGAGACACAGC
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      ...
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
    
  

5 rows × 3185 columns

This notebook needs rearrangement!!!

PCA, ICA, NMF results should also be plotted.

PCA/ICA/NMF -> tSNE



In [11]:

    
# Estimate percentage of variance can be explained by principal components:
sns.set()
pca = PCA().fit(subset)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')









    Out[11]:





<matplotlib.text.Text at 0x11341e198>



In [12]:

    
# Combine dimension reduction and tSNE. 
# Allow user to adjust the number of principal components and the perplexity of tSNE.
# Plot interactively.
## Define corresponding func for each of the dimension reduction approaches.
def PCAsm(data, n, max_it):
    name = "PCA"
    pcaer = PCA(n_components=n)
    pcad = pcaer.fit_transform(data)
    df = pd.DataFrame(pcad, index=data.index)
    print("Shape of dimension reduced matrix is:", pcad.shape)
    return df,name

def FastICAsm(data, n, max_it):
    name = "ICA"
    icaer = FastICA(n_components=n, max_iter=max_it)
    icad = icaer.fit_transform(data)
    print("Shape of dimension reduced matrix is:", icad.shape)
    df = pd.DataFrame(icad, index=data.index)
    return df,name

def NMFsm(data, n, max_it):
    name = "NMF"
    nmfer = NMF(n_components=n,max_iter=max_it)
    nmfd = nmfer.fit_transform(data)
    print("Shape of dimension reduced matrix is:", nmfd.shape)
    df = pd.DataFrame(nmfd, index=data.index)
    return df,name

## Define the main func to take in all adjustable variables
## Smush, save the new dataframe in df, and plot the tSNE plot.
def f(data, sm, n = 10,perp = 30, max_it = 600, it = 1000):
    """
    Reduce the dimension of original dataset via PCA, ICA or NMF as chosen by user.
    The output dataframe will be used as input of tSNE.
    Upon activation by interact(), user can choose their desired sm, n, perp, max_it, and it. 
    In current version, tSNE is using 'Manhattan' as the distance matrix and working at learning rate of 1000.
    
    Args:
        sm: str. Name of smusher. Choose from PCA, ICA and NMF
        n: int. Initial number of dimensions. The number of dimensions of smusher processed matrix before tSNE
        perp: int. Perplexity level of tSNE
        max_it: int. Maximum number of iterations in ICA or NMF
        it: int. Maximum number of iterations in tSNE
    """
    df, name = sm(data, n, max_it)
    print("Perplexity level:", perp)
    tsner = TSNE(perplexity=perp,n_iter=it,metric='manhattan')
    tsned = tsner.fit_transform(df)
    plt.figure(figsize=(3,3))
    plt.scatter(tsned[:,0],tsned[:,1], 10)
    plt.xlabel("tSNE1",fontsize=16)
    plt.ylabel("tSNE2",fontsize=16)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    title = name + ":" + str(n) + " initial dimensions, " + "perplexity level at " + str(perp) + ", " + str(it) + " iterations"
    plt.title(title)



In [6]:

    
# Set an interactive tSNE plot canvas
interact(f, data = fixed(subset), n=(10,100,10), perp = (5,50), it = (500,5000,500), max_it = (200,1000,200),sm={"PCA":PCAsm, "ICA":FastICAsm, "NMF":NMFsm})









    





 
 










    Out[6]:





<function __main__.f>

Independent Component Analysis & Component-specific cell cycle genes



In [13]:

    
# Run ICA and recover the transformed matrix and key components
icaer = FastICA(n_components=6, max_iter=600)
icad = icaer.fit_transform(subset)
print("Shape of transformed ICA matrix is: ", icad.shape)









    



Shape of transformed ICA matrix is:  (135, 6)



In [14]:

    
# Get component composition
icaf = icaer.fit(subset)
components = icaf.components_



In [15]:

    
# For each component, find the key contributors.
## Calculate absolute values of the faction factor
## Set an empty dictionary with component index as key and the feature list as value
ICA_comp_dict = {}
for i in range(6):
    # Get composition of one component
    array = pd.DataFrame(data = components).loc[i]
    # Turn negative value to positive
    abs_array = [abs(x) for x in array]
    # Set filtering threshold to 90% quantile
    thres = np.percentile(abs_array, 90)
    # Make the boolean mask
    mask = abs_array > thres
    # Obtain genes with faction over the threshold
    comp_gene_list = subset.columns[mask]
    print(comp_gene_list.shape)
    gene_list_upper = [x.upper() for x in comp_gene_list]
    # Save feature list into dictionary
    key = "Component" + str(i+1)
    ICA_comp_dict[key] = gene_list_upper    

#(pd.DataFrame(data = components).loc[0] > 0.0001).sum()









    



(319,)
(319,)
(319,)
(319,)
(319,)
(319,)



In [18]:

    
# Capitalize all genes in subset
subset.columns = [x.upper() for x in subset.columns]
subset.columns









    Out[18]:





Index(['ACBD3', 'AL928854.1', 'AL954715.2', 'ARHGEF12', 'BICD2', 'BX004816.3',
       'BX284638.1', 'BX321875.1', 'BX649250.1', 'BX664625.3',
       ...
       'ZNF654', 'ZNF706', 'ZNF750', 'ZNF865', 'ZNFX1', 'ZRANB2', 'ZSWIM5',
       'ZSWIM8', 'ZW10', 'ZYG11'],
      dtype='object', length=3185)



In [44]:

    
sns.distplot(pd.DataFrame(data = components).loc[5])









    Out[44]:





<matplotlib.axes._subplots.AxesSubplot at 0x11718b9e8>



In [75]:

    
# set.intersection(<sets>) -> find the overlapping terms in the genes



In [3]:

    
# Gene ontology enrichment
## Original proportion of cell cycle genes in the complete dataset
## Read in zebrafish cell cycle genes
ccycle_gene_mtx = pd.read_csv("Test_cellcycle_info_merged.txt",index_col=0)
ccycle_gene_mtx.head()









    Out[3]:







  
    
      
      GCCAAATGCACG
      AGCGGTGGCAGC
      TGAACCCCAGCT
      ATCTTGGCGGCG
      GTTCCTACAACA
      ATAGAGTCGCAT
      GCGCACGTTGAG
      TTCACGGCTGAG
      GCGAGACACAGC
      CGAGTCGGTAGC
      ...
      AGAATCGAGCCT
      GTAGATCATCTC
      TAATGAAGTGGA
      CTCGGGATATGT
      GCTAAGGTTGTT
      AAGTTCAAGGGG
      GGTCATGGTTCG
      Sum
      Upper name.1
      Stages
    
    
      Upper name
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      CABZ01066926.1
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      5
      CABZ01066926.1
      G1/S
    
    
      FOXK2
      0
      0
      0
      0
      0
      1
      0
      2
      2
      0
      ...
      0
      1
      0
      0
      0
      0
      0
      37
      FOXK2
      M/G1
    
    
      FOXK2
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      2
      FOXK2
      M/G1
    
    
      MZT1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      11
      MZT1
      M
    
    
      PHTF1
      0
      2
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      17
      PHTF1
      S
    
  

5 rows × 503 columns



In [8]:

    
## Extract only gene name and stage info from this dataframe
ccycle_genes = ccycle_gene_mtx[['Upper name.1','Stages']]
ccycle_gene = ccycle_genes.drop_duplicates()
ccycle_gene.columns = ['Zebrafish gene name', 'Stages']



In [22]:

    
## Calculate backround frequency
stage_bg = {}
for stage in ["G1/S","S","G2/M","M","M/G1"]:
    stage_df = ccycle_gene[ccycle_gene['Stages'] == stage]
    print("Shape of stage {}-specific sub-matrix is: {}".format(stage,stage_df.shape))
    i = 0
    for gene in stage_df['Zebrafish gene name']:
        if gene in subset.columns:
            i += 1    
    #ratio = i / len(subset.columns)
    stage_bg[stage] = i
    print("There are {} genes of stage {} in the subset background.".format(i,stage))
stage_bg









    



Shape of stage G1/S-specific sub-matrix is: (73, 2)
There are 24 genes of stage G1/S in the subset background.
Shape of stage S-specific sub-matrix is: (76, 2)
There are 22 genes of stage S in the subset background.
Shape of stage G2/M-specific sub-matrix is: (101, 2)
There are 29 genes of stage G2/M in the subset background.
Shape of stage M-specific sub-matrix is: (119, 2)
There are 45 genes of stage M in the subset background.
Shape of stage M/G1-specific sub-matrix is: (87, 2)
There are 47 genes of stage M/G1 in the subset background.






    Out[22]:





{'G1/S': 24, 'G2/M': 29, 'M': 45, 'M/G1': 47, 'S': 22}



In [20]:

    
## Calculate number of genes are overlapping with cell cycle genes for each component
stage_spec_gene_count_dict = {}
## Loop through list of component
for comp in sorted(ICA_comp_dict.keys()):
    # Set count list default to zero
    comp_ccycle_count_list = [0,0,0,0,0]
    # Loop through the stages 
    for i,stage in zip(range(5),["G1/S","S","G2/M","M","M/G1"]):
        stage_df = ccycle_gene[ccycle_gene['Stages'] == stage]
        # Loop through genes for one component
        for gene in ICA_comp_dict[comp]:
            # Count number of cell cycle related genes in this component
            if gene in stage_df['Zebrafish gene name']:
                comp_ccycle_count_list[i] += 1
    # Save gene count to dictionary
    stage_spec_gene_count_dict[comp] = comp_ccycle_count_list

stage_spec_gene_count_dict









    Out[20]:





{'Component1': [0, 1, 1, 0, 2],
 'Component2': [0, 1, 2, 1, 4],
 'Component3': [2, 1, 3, 3, 8],
 'Component4': [0, 0, 1, 1, 0],
 'Component5': [0, 0, 1, 4, 4],
 'Component6': [1, 0, 1, 2, 4]}



In [25]:

    
len(subset.columns)









    Out[25]:





3185



In [33]:

    
# Calculate the ratio of cell cycle genes in each component
# Set two empty dicts, one for oddsratio of enrichment for each components, and the other for p-value
ICA_comp_odds_dict = {}
ICA_comp_pval_dict = {}

# Loop through list of components
for comp in sorted(stage_spec_gene_count_dict.keys()):
    odds_list = []
    pval_list = []
    # Loop through index and their corresponding stages 
    for i,stage in zip(range(5),["G1/S","S","G2/M","M","M/G1"]):
        # Save number of cell cycle gene in background, and that in specific component in variable.
        ccbg = int(stage_bg[stage])
        cccomp = int(stage_spec_gene_count_dict[comp][i])
        # Use scipy stats to calculate fisher's exact p-value
        odds, pval = stats.fisher_exact([[ccbg, cccomp],[3185-ccbg, 319-cccomp]])
        # Append oddsratio and pvalue for each stage to a list
        odds_list.append(odds)
        pval_list.append(pval)
    # Save the oddsratio/pvalue list as value for each component (key)
    ICA_comp_odds_dict[comp] = odds_list
    ICA_comp_pval_dict[comp] = pval_list
pd.DataFrame.from_dict(ICA_comp_odds_dict, orient='index')

Multi-Dimensional Scaling

Work directly on the raw matrix



In [73]:

    
mdsed2 = smusher2.fit_transform(subset)
print(mdsed2.shape)

plt.figure(figsize=(4,4))
plt.scatter(mdsed2[:,0],mdsed2[:,1], 10)
plt.xlabel("Dimension1",fontsize=16)
plt.ylabel("Dimension2",fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)









    



(135, 2)






    Out[73]:





(array([-200., -150., -100.,  -50.,    0.,   50.,  100.,  150.,  200.,
         250.,  300.]), <a list of 11 Text yticklabel objects>)

Work on PCA-treated data (?)



In [7]:

    
smusher2 = MDS()
mdsed = smusher2.fit_transform(pcad_df)
print(mdsed.shape)



In [ ]:

    
plt.figure(figsize=(8,8))
plt.scatter(mdsed[:,0],mdsed[:,1], 10)
plt.xlabel("Dimension1",fontsize=16)
plt.ylabel("Dimension2",fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

	0	1	2	3	4
Component1	inf	2.211824	2.922053	inf	2.373964
Component2	inf	2.211824	1.456432	4.557325	1.179493
Component3	1.203417	2.211824	0.967892	1.509554	0.582258
Component4	inf	inf	2.922053	4.557325	inf
Component5	inf	inf	2.922053	1.128583	1.179493
Component6	2.414426	inf	2.922053	2.271497	1.179493

	ACBD3	AL928854.1	AL954715.2	BICD2	BX004816.3	BX284638.1	BX321875.1	BX664625.3	...	znf706	znf750	znf865	znfx1	zranb2	zswim5	zswim8	zw10
ATCTTGGCGGCG	1	0	0	0	0	0	0	0	...	0	0	0	0	0	1	0	4
ATAGAGTCGCAT	1	0	0	0	4	0	1	0	...	0	0	0	2	1	2	1	0
GCGCACGTTGAG	0	1	0	1	0	0	0	0	...	1	1	0	0	0	1	0	0
TTCACGGCTGAG	0	0	1	0	0	2	0	0	...	0	0	0	0	0	1	0	0
GCGAGACACAGC	0	0	0	0	0	0	0	1	...	0	0	1	0	0	0	0	0

	GCCAAATGCACG	AGCGGTGGCAGC	TGAACCCCAGCT	ATCTTGGCGGCG	GTTCCTACAACA	ATAGAGTCGCAT	GCGCACGTTGAG	TTCACGGCTGAG	GCGAGACACAGC	CGAGTCGGTAGC	...	AGAATCGAGCCT	GTAGATCATCTC	TAATGAAGTGGA	CTCGGGATATGT	GCTAAGGTTGTT	AAGTTCAAGGGG	GGTCATGGTTCG	Sum	Upper name.1	Stages
Upper name
CABZ01066926.1	0	0	0	1	0	0	0	0	0	0	...	0	0	0	0	0	0	0	5	CABZ01066926.1	G1/S
FOXK2	0	0	0	0	0	1	0	2	2	0	...	0	1	0	0	0	0	0	37	FOXK2	M/G1
FOXK2	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	2	FOXK2	M/G1
MZT1	0	0	0	0	0	1	0	0	0	0	...	0	0	0	0	0	0	0	11	MZT1	M
PHTF1	0	2	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	17	PHTF1	S