Dimension Reduction and Visualization (single cell as bulk & bulk)


In [1]:
# Import required modules
# Python plotting library
import matplotlib.pyplot as plt
# Numerical python library
import numpy as np
# Dataframes in Python
import pandas as pd
# Statistical plotting library we'll use
import seaborn as sns
# Import packages for dimension reduction
from sklearn.decomposition import PCA, FastICA, NMF
from sklearn.manifold import TSNE, MDS
# This is necessary to show the plotted figures inside the notebook -- "inline" with the notebook cells
%matplotlib inline
# Import interactive modules
from ipywidgets import interact, interactive, fixed
from IPython.display import display
# Import stats for Fisher exact p-value calculation
import scipy.stats as stats

In [2]:
# Read in the DGE (digital gene expression) data sheet
inFile = './AGGallSamp_wBulkNorm_logged_widemtx.csv'
subset = pd.read_csv(inFile,
                     # Set the first column (Python counts from 0, not 1) as the row names
                     index_col=0,
                     # Decompress the gzipped file
                     compression='gzip')
# Print the shape and the top 5 rows of the new dataframe. Cells should be the
# index and genes the column names; transpose with subset = subset.T if needed.
print(subset.shape)
subset.head()


/bar/yhou/anaconda3/envs/sca/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
(16180, 18830)
Out[2]:
ENSDARG00000100660 ENSDARG00000100422 ENSDARG00000102128 ENSDARG00000103095 ENSDARG00000102226 ENSDARG00000104049 ENSDARG00000102474 ENSDARG00000100143 ENSDARG00000104373 ENSDARG00000098311 ... ENSDARG00000078600 ENSDARG00000097721 ENSDARG00000051761 ENSDARG00000093480 ENSDARG00000089892 ENSDARG00000051756 ENSDARG00000102643 ENSDARG00000103220 ENSDARG00000101560 REP1
1 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 2.321928 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.584963 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
5 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.584963 1.0

5 rows × 18830 columns
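
One way to silence the DtypeWarning above at load time (a sketch; assumes the first column is the intended index and that applying str to it is acceptable for this file):

In [ ]:
# Force the index column to string while reading so the mixed-type warning
# never fires; converters maps a column position (or label) to a function
# applied to every value in that column.
# subset = pd.read_csv(inFile, index_col=0, converters={0: str})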


In [3]:
# We should have specified the index dtype on import; instead, convert all
# index values to strings after the fact (subset.index.astype(str) also works)
subset.index = [str(index) for index in subset.index]

In [4]:
# Read in metadata
metadata = pd.read_csv("./AGGallSamp_wBulk_metadata.csv", index_col=0)
print(metadata.shape)
metadata.head()


(18781, 6)
Out[4]:
Barcodes SampleID Stage Replicate Type GFP
1 AAACCTGAGAGTCGGT-1 1 0dpa 1 SC False
2 AAACCTGCAAATCCGT-1 1 0dpa 1 SC False
3 AAACCTGGTATAATGG-1 1 0dpa 1 SC False
4 AAACCTGTCAGCTTAG-1 1 0dpa 1 SC False
5 AAACCTGTCATATCGG-1 1 0dpa 1 SC True
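
A quick sanity check before joining (a sketch; assumes the matrix index and the metadata index are meant to share labels):

In [ ]:
# Count how many matrix rows have a matching metadata row. If the overlap
# is zero, the metadata index probably needs the same string conversion.
common = subset.index.intersection(metadata.index.map(str))
print(len(common), "of", subset.shape[0], "matrix rows found in metadata")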

In [ ]:
## Transpose matrix
# subset = subset.T
# subset.head()

Before smushing (dimension reduction): log transformation?


In [ ]:
# Is the data highly skewed?
## Rough visual estimate: overlay expression histograms for a spread of cells
for i in range(1, 2500, 250):
    sns.distplot(subset.iloc[i][subset.iloc[i] > 1])
# If yes:
## log-transform the whole dataframe
# loggedsubset = np.log2(subset + 1)
# loggedsubset.head()

In [ ]:
# Distribution after log transformation
# for i in range(1,2500,250):
#     sns.distplot(loggedsubset.iloc[i][loggedsubset.iloc[i] > 1], )
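
To put a number on the skew rather than eyeballing histograms, a minimal sketch using the scipy.stats module imported above (same cell positions as the loop above):

In [ ]:
# Report per-cell skewness of the expressed values; heavily right-skewed
# data gives large positive values, roughly symmetric data values near 0.
for i in range(1, 2500, 250):
    vals = subset.iloc[i][subset.iloc[i] > 1]
    print(subset.index[i], "skewness:", stats.skew(vals))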

For the combined single-cell and bulk wide matrix and metadata


In [5]:
# Fit metadata to the matrix shape

## The index columns had different data types, so rows were joined incorrectly...
## Reloading with low_memory=False kills the kernel.
## Workaround: convert all index values to strings before combining (joining),
## as done above.

# Left-join the metadata onto the data subset, then keep only the metadata
# columns, now aligned to the matrix rows
subsetWmeta = subset.join(metadata)
fitMeta = subsetWmeta[list(metadata.columns)]
print("Metadata with fitted shape of: ", fitMeta.shape)


Metadata with fitted shape of:  (16180, 6)
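
To confirm the join actually matched rows (rather than filling everything with NaN), a quick sketch:

In [ ]:
# Rows whose metadata columns are all NaN were not found in the metadata;
# a large count here would indicate an index mismatch.
print(fitMeta["Type"].value_counts(dropna=False))
print("Rows without metadata:", fitMeta["Type"].isnull().sum())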

In [6]:
# Look-up table: sample ID -> (stage, replicate)
dict_sampIDtoMeta = {
    "1": ["0dpa","1"],
    "2": ["0dpa","2"],
    "3": ["1dpa","1"],
    "4": ["1dpa","2"],
    "5": ["2dpa","1"],
    "6": ["2dpa","2"],
    "7": ["4dpa","1"],
    "8": ["4dpa","2"],
                    }
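
A quick usage example of the look-up table:

In [ ]:
# Unpack stage and replicate for a given sample ID, e.g. sample 3
stage, replicate = dict_sampIDtoMeta["3"]
print(stage, replicate)  # -> 1dpa 1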

In [ ]:
# Extract the bulk data subset and save it as a new dataframe
bulkMask = fitMeta["Type"] == "Bulk"
asBulkDF = subset[bulkMask]
# Loop through all possible samples (1-8)
for i in range(1, 9):
    # Build boolean masks for this sample, split by GFP status
    sampID = str(i)
    sampGPMask = (fitMeta["SampleID"] == i) & (fitMeta["GFP"] == True)
    sampGNMask = (fitMeta["SampleID"] == i) & (fitMeta["GFP"] == False)
    # Extract the sample subsets
    sampGPSubset = subset[sampGPMask]
    sampGNSubset = subset[sampGNMask]
    # Calculate the per-gene mean over all cells within the sample
    sampGPMean = sampGPSubset.mean()
    sampGNMean = sampGNSubset.mean()
    # Name the mean series, e.g. "0dpa1GFPpo"
    sampGPName = dict_sampIDtoMeta[sampID][0] + dict_sampIDtoMeta[sampID][1] + "GFPpo"
    sampGPMean = sampGPMean.rename(sampGPName)
    sampGNName = dict_sampIDtoMeta[sampID][0] + dict_sampIDtoMeta[sampID][1] + "GFPne"
    sampGNMean = sampGNMean.rename(sampGNName)
    # Append the calculated means to the bulk matrix as new rows
    # (see the groupby alternative below)
    asBulkDF = asBulkDF.append(sampGPMean)
    asBulkDF = asBulkDF.append(sampGNMean)
    # Report the shape of the growing matrix
    print("Current matrix shape: ", asBulkDF.shape)
# Dataframe of real bulk rows plus per-sample single-cell means.
# Means-only dataframe? Could be derived by dropping the bulk rows.
asBulkDF.head()
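
For reference, the same per-sample, per-GFP means can be computed in one groupby call instead of the loop (a sketch; the result is indexed by (SampleID, GFP) rather than the names built above). Note also that DataFrame.append is deprecated in newer pandas, where pd.concat([asBulkDF, sampGPMean.to_frame().T]) does the same job.

In [ ]:
# Group the single-cell rows by (SampleID, GFP) and take per-gene means.
# Assumes fitMeta shares subset's index (it does, via the join above).
scMask = fitMeta["Type"] == "SC"
pseudobulk = subset[scMask].groupby(
    [fitMeta.loc[scMask, "SampleID"], fitMeta.loc[scMask, "GFP"]]
).mean()
pseudobulk.head()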

In [ ]:
# Normalize every sample's total count to 3000
## Add one column holding the sum of each sample (row)
asBulkDF["SUM"] = asBulkDF.sum(axis=1)
## Transpose so samples become columns, making per-sample division easier
asBulkD = asBulkDF.T
## Divide every value by its sample's total / 3000; select the SUM row by
## label instead of its hard-coded position (iloc[18830])
normAsBulkDF = asBulkD.divide(asBulkD.loc["SUM"] / 3000)
normAsBulkDF
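
The same normalization without the transpose round-trip (a sketch):

In [ ]:
# Divide each row (sample) by its own total / 3000, skipping the SUM column
# normAsBulkDF_alt = asBulkDF.drop(columns="SUM").div(asBulkDF["SUM"] / 3000, axis=0)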

In [ ]:
# Transpose back and drop SUM column
DFforPCA = normAsBulkDF.T.drop(["SUM"],axis=1)
print("Shape of matrix for PCA: ",DFforPCA.shape)
DFforPCA.head()

In [ ]:
# Estimate the percentage of variance explained by the principal components:
sns.set()
pca = PCA().fit(DFforPCA)
plt.figure(figsize=(8,6),dpi=300)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlim(0,12)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
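
To pick the number of components programmatically rather than reading it off the plot, a sketch assuming a 90% variance cutoff:

In [ ]:
# Smallest number of PCs whose cumulative explained variance reaches 90%
n_pcs = int(np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.90)) + 1
print("PCs needed for 90% variance:", n_pcs)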

In [ ]:
# Use the number of PCs determined above to transform the data
# (the original double assignment also rebound `pca` to the array, clobbering the fitted model)
pca_tsfed = PCA(n_components=4).fit_transform(DFforPCA)
print(pca_tsfed.shape)
# Plot samples by transformed components
fig, ax = plt.subplots(figsize=(8,6),dpi=300)
ax.scatter(pca_tsfed[:, 1], pca_tsfed[:, 2],
            c=["b","g","r","c","b","g","r","c","m","y","m","y","k","b","k","b","g","r","g","r","c","m","c","m"], edgecolor='none', alpha=0.5)
# Hand-typed short labels matching the row order of DFforPCA
name = ["B4P","B4N","B0P","B0N","B4P","B4N","B0P","B0N","SC0P","SC0N","SC0P","SC0N","SC1P","SC1N","SC1P","SC1N","SC2P","SC2N","SC2P","SC2N","SC4P","SC4N","SC4P","SC4N"]
# name = DFforPCA.index  # alternative: use the full index labels
for i, txt in enumerate(name):
    ax.annotate(txt, (pca_tsfed[:, 1][i], pca_tsfed[:, 2][i]))
ax.set(xlabel='component 2', ylabel='component 3')
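
The manifold methods imported at the top (TSNE, MDS) haven't been used yet; a minimal t-SNE sketch on the same pseudobulk matrix (perplexity=5 is an assumption, chosen to sit well under the 24 samples):

In [ ]:
# t-SNE embedding of the 24 pseudobulk samples; perplexity must be < n_samples
tsne_tsfed = TSNE(n_components=2, perplexity=5, random_state=0).fit_transform(DFforPCA)
fig, ax = plt.subplots(figsize=(8,6), dpi=300)
ax.scatter(tsne_tsfed[:, 0], tsne_tsfed[:, 1], alpha=0.5)
for i, txt in enumerate(DFforPCA.index):
    ax.annotate(txt, (tsne_tsfed[i, 0], tsne_tsfed[i, 1]))
ax.set(xlabel='t-SNE 1', ylabel='t-SNE 2')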
