In [1]:
# Import required modules
# Python plotting library
import matplotlib.pyplot as plt
# Numerical python library
import numpy as np
# Dataframes in Python
import pandas as pd
# Statistical plotting library we'll use
import seaborn as sns
# Import packages for dimension reduction
from sklearn.decomposition import PCA, FastICA, NMF
from sklearn.manifold import TSNE, MDS
# This is necessary to show the plotted figures inside the notebook -- "inline" with the notebook cells
%matplotlib inline
# Import interactive modules
from ipywidgets import interact,interactive,fixed
from IPython.display import display
# Import stats for Fisher exact p-value calculation
import scipy.stats as stats
In [2]:
# Read in the DGE data sheet
inFile = './AGGallSamp_wBulkNorm_logged_widemtx.csv'
subset = pd.read_csv(inFile,
                     # Set the first column (Python counts from 0, not 1) as the row names
                     index_col=0,
                     # Request gzip decompression explicitly; the plain .csv extension keeps pandas from inferring it
                     compression='gzip')
# Print the shape and the top 5 rows of the newly assigned dataframe. Rows should be cells, columns should be genes.
# Transpose with subset = subset.T if required.
print(subset.shape)
subset.head()
Out[2]:
In [3]:
# The index dtype was not specified at load time; convert every index value to a string
# so it matches the metadata index when joining later
subset.index = [str(index) for index in subset.index]
In [4]:
# Read in metadata
metadata = pd.read_csv("./AGGallSamp_wBulk_metadata.csv", index_col=0)
print(metadata.shape)
metadata.head()
Out[4]:
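In [ ]:
# Sanity check (a sketch added here, not part of the original analysis): the join below only
# matches rows whose index values are identical, so confirm the two indexes overlap.
# Both indexes are assumed to hold the same cell barcodes.
shared = subset.index.intersection(metadata.index)
print("Cells in expression matrix:", subset.shape[0])
print("Rows in metadata:", metadata.shape[0])
print("Index values found in both:", len(shared))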
In [ ]:
## Transpose matrix
# subset = subset.T
# subset.head()
In [ ]:
# Is the data highly skewed?
## Can be roughly estimated by:
for i in range(1, 2500, 250):
    sns.distplot(subset.iloc[i][subset.iloc[i] > 1])
#if yes:
## whole dataframe log transformation
# loggedsubset = np.log2(subset + 1)
# loggedsubset.head()
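In [ ]:
# Optional numeric check (a sketch, not part of the original workflow): scipy's skew()
# quantifies what the distribution plots above show visually. Skewness well above ~1 for
# most cells would suggest a log transform is worthwhile. The same spread of cells as in
# the plotting loop above is sampled here.
skews = [stats.skew(subset.iloc[i][subset.iloc[i] > 1]) for i in range(1, 2500, 250)]
print("Per-cell skewness of expressed genes:", np.round(skews, 2))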
In [ ]:
# Distribution after log transformation
# for i in range(1, 2500, 250):
#     sns.distplot(loggedsubset.iloc[i][loggedsubset.iloc[i] > 1])
In [5]:
# Fit metadata to mtx shape
## The indexes originally had different data types, so they joined incorrectly.
## Reloading with the low_memory option kills the kernel,
## so all index values were converted to strings above before joining.
# Join the data subset with the metadata, keeping only the metadata columns for cells present in the matrix
subsetWmeta = subset.join(metadata)
fitMeta = subsetWmeta[list(metadata.columns)]
print("Metadata with fitted shape of: ", fitMeta.shape)
In [6]:
# Sample ID to meta extractor
dict_sampIDtoMeta = {
"1": ["0dpa","1"],
"2": ["0dpa","2"],
"3": ["1dpa","1"],
"4": ["1dpa","2"],
"5": ["2dpa","1"],
"6": ["2dpa","2"],
"7": ["4dpa","1"],
"8": ["4dpa","2"],
}
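In [ ]:
# Small usage example (added for illustration): the dictionary maps a sample ID to its
# timepoint and replicate, which are concatenated with a GFP suffix to build the labels
# used for the pseudo-bulk rows below, e.g. sample "3" -> "1dpa1GFPpo".
example_id = "3"
timepoint, replicate = dict_sampIDtoMeta[example_id]
print(timepoint + replicate + "GFPpo")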
In [ ]:
# Extract bulk data subset and save as a new dataframe
bulkMask = fitMeta["Type"] == "Bulk"
asBulkDF = subset[bulkMask]
# Loop through all possible samples (1-8)
for i in range(1,9):
    # Build boolean masks for GFP-positive and GFP-negative cells of this sample
    sampID = str(i)
    sampGPMask = (fitMeta["SampleID"] == i) & (fitMeta["GFP"] == True)
    sampGNMask = (fitMeta["SampleID"] == i) & (fitMeta["GFP"] == False)
    # Extract the sample subsets
    sampGPSubset = subset[sampGPMask]
    sampGNSubset = subset[sampGNMask]
    # Calculate the per-gene mean across all cells within the sample
    sampGPMean = sampGPSubset.mean()
    sampGNMean = sampGNSubset.mean()
    # Rename the mean series, e.g. "0dpa1GFPpo" for sample 1 GFP-positive
    sampGPName = dict_sampIDtoMeta[sampID][0] + dict_sampIDtoMeta[sampID][1] + "GFPpo"
    sampGPMean = sampGPMean.rename(sampGPName)
    sampGNName = dict_sampIDtoMeta[sampID][0] + dict_sampIDtoMeta[sampID][1] + "GFPne"
    sampGNMean = sampGNMean.rename(sampGNName)
    # Append the calculated means to the bulk matrix
    asBulkDF = asBulkDF.append(sampGPMean)
    asBulkDF = asBulkDF.append(sampGNMean)
    # Report the shape of the growing matrix
    print("Current matrix shape: ", asBulkDF.shape)
# Preview the combined bulk + per-sample-mean (pseudo-bulk) dataframe
asBulkDF.head()
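In [ ]:
# Compatibility note (a sketch, not from the original): DataFrame.append was removed in
# pandas 2.0. With a newer pandas, an equivalent approach is to collect the per-sample mean
# Series in a list inside the loop and concatenate them onto the bulk matrix once:
# meanRows.append(sampGPMean); meanRows.append(sampGNMean)    # inside the loop
# asBulkDF = pd.concat([asBulkDF, pd.DataFrame(meanRows)])    # after the loop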
In [ ]:
# Normalize so that each sample's total count sums to 3000
## Add a column holding each sample's total (sum over all genes)
asBulkDF["SUM"] = asBulkDF.sum(axis=1)
## Transpose so each sample is a column, which makes the division easier
asBulkD = asBulkDF.T
## Divide every value by (sample total / 3000); select the "SUM" row by label instead of a hard-coded position
normAsBulkDF = asBulkD.divide(asBulkD.loc["SUM"] / 3000)
normAsBulkDF
In [ ]:
# Transpose back and drop SUM column
DFforPCA = normAsBulkDF.T.drop(["SUM"],axis=1)
print("Shape of matrix for PCA: ",DFforPCA.shape)
DFforPCA.head()
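In [ ]:
# Sanity check (a sketch added here): after the normalization above, every row (sample)
# should sum to roughly 3000, apart from floating-point rounding.
print(DFforPCA.sum(axis=1).round(1))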
In [ ]:
# Estimate the percentage of variance explained by the principal components:
sns.set()
pca = PCA().fit(DFforPCA)
plt.figure(figsize=(8,6),dpi=300)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlim(0,12)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
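In [ ]:
# Helper (a sketch, not part of the original): read off the number of components needed to
# reach a chosen variance threshold from the curve above. 90% is an arbitrary example
# threshold; the original analysis settles on 4 components below.
cumvar = np.cumsum(pca.explained_variance_ratio_)
n_pcs = int(np.argmax(cumvar >= 0.90)) + 1
print("Components needed for 90% of variance:", n_pcs)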
In [ ]:
# Use the number of PCs determined above to transform the data
pca_tsfed = PCA(n_components=4).fit_transform(DFforPCA)
print(pca_tsfed.shape)
# Plot samples by transformed components
fig, ax = plt.subplots(figsize=(8,6),dpi=300)
ax.scatter(pca_tsfed[:, 1], pca_tsfed[:, 2],
c=["b","g","r","c","b","g","r","c","m","y","m","y","k","b","k","b","g","r","g","r","c","m","c","m"], edgecolor='none', alpha=0.5)
name = ["B4P","B4N","B0P","B0N","B4P","B4N","B0P","B0N","SC0P","SC0N","SC0P","SC0N","SC1P","SC1N","SC1P","SC1N","SC2P","SC2N","SC2P","SC2N","SC4P","SC4N","SC4P","SC4N"]
#name = DFforPCA.index
for i, txt in enumerate(name):
    ax.annotate(txt, (pca_tsfed[:, 1][i], pca_tsfed[:, 2][i]))
ax.set(xlabel='component 2', ylabel='component 3')
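In [ ]:
# Optional comparison (a sketch using the MDS import from the first cell, not an original
# step): embed the same matrix with metric MDS and check whether the samples group the
# same way as in the PCA plot. random_state is fixed only for reproducibility.
mds = MDS(n_components=2, random_state=0)
mds_tsfed = mds.fit_transform(DFforPCA)
fig, ax = plt.subplots(figsize=(8, 6), dpi=300)
ax.scatter(mds_tsfed[:, 0], mds_tsfed[:, 1], alpha=0.5)
for i, txt in enumerate(name):
    ax.annotate(txt, (mds_tsfed[i, 0], mds_tsfed[i, 1]))
ax.set(xlabel='MDS 1', ylabel='MDS 2')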