In [1]:
# Import required modules
# Python plotting library
import matplotlib.pyplot as plt
# Numerical python library
import numpy as np
# Dataframes in Python
import pandas as pd
# Statistical plotting library we'll use
import seaborn as sns
# Import packages for dimension reduction
from sklearn.decomposition import PCA, FastICA, NMF
from sklearn.manifold import TSNE, MDS
# This is necessary to show the plotted figures inside the notebook -- "inline" with the notebook cells
%matplotlib inline
# Import interactive modules
from ipywidgets import interact,interactive,fixed
from IPython.display import display
# Import stats for Fisher exact p-value calculation
import scipy.stats as stats
In [2]:
# Read in the DGE data sheet
inFile = './AGGallSamp_wBulkNorm_logged_widemtx.csv'
subset = pd.read_csv(inFile,
                     # Set the first column (Python counts from 0, not 1) as the row names
                     index_col=0,
                     # Request gzip decompression explicitly; the plain .csv extension keeps pandas from inferring it
                     compression='gzip')
# Print the shape and the top 5 rows of the newly assigned dataframe. Rows should be cells, columns should be genes.
# Transpose with subset = subset.T if required.
print(subset.shape)
subset.head()
Out[2]:
In [3]:
# The index dtype was not specified at load time; convert every index value to a string
# so it matches the metadata index when joining later
subset.index = [str(index) for index in subset.index]
In [4]:
# Read in metadata
metadata = pd.read_csv("./AGGallSamp_wBulk_metadata.csv", index_col=0)
print(metadata.shape)
metadata.head()
Out[4]:
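In [ ]:
# Sanity check (a sketch added here, not part of the original analysis): the join below only
# matches rows whose index values are identical, so confirm the two indexes overlap.
# Both indexes are assumed to hold the same cell barcodes.
shared = subset.index.intersection(metadata.index)
print("Cells in expression matrix:", subset.shape[0])
print("Rows in metadata:", metadata.shape[0])
print("Index values found in both:", len(shared))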
In [ ]:
## Transpose matrix
# subset = subset.T
# subset.head()
In [ ]:
# Is the data highly skewed?
## Can be roughly estimated by:
for i in range(1, 2500, 250):
    sns.distplot(subset.iloc[i][subset.iloc[i] > 1])
#if yes:
## whole dataframe log transformation
# loggedsubset = np.log2(subset + 1)
# loggedsubset.head()
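In [ ]:
# Optional numeric check (a sketch, not part of the original workflow): scipy's skew()
# quantifies what the distribution plots above show visually. Skewness well above ~1 for
# most cells would suggest a log transform is worthwhile. The same spread of cells as in
# the plotting loop above is sampled here.
skews = [stats.skew(subset.iloc[i][subset.iloc[i] > 1]) for i in range(1, 2500, 250)]
print("Per-cell skewness of expressed genes:", np.round(skews, 2))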
In [ ]:
# Distribution after log transformation
# for i in range(1, 2500, 250):
#     sns.distplot(loggedsubset.iloc[i][loggedsubset.iloc[i] > 1])
In [5]:
# Fit metadata to mtx shape
## The indexes originally had different data types, so they joined incorrectly.
## Reloading with the low_memory option kills the kernel,
## so all index values were converted to strings above before joining.
# Join the data subset with the metadata, keeping only the metadata columns for cells present in the matrix
subsetWmeta = subset.join(metadata)
fitMeta = subsetWmeta[list(metadata.columns)]
print("Metadata with fitted shape of: ", fitMeta.shape)
In [6]:
# Sample ID to meta extractor
dict_sampIDtoMeta = {
"1": ["0dpa","1"],
"2": ["0dpa","2"],
"3": ["1dpa","1"],
"4": ["1dpa","2"],
"5": ["2dpa","1"],
"6": ["2dpa","2"],
"7": ["4dpa","1"],
"8": ["4dpa","2"],
}
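In [ ]:
# Small usage example (added for illustration): the dictionary maps a sample ID to its
# timepoint and replicate, which are concatenated with a GFP suffix to build the labels
# used for the pseudo-bulk rows below, e.g. sample "3" -> "1dpa1GFPpo".
example_id = "3"
timepoint, replicate = dict_sampIDtoMeta[example_id]
print(timepoint + replicate + "GFPpo")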
In [ ]:
# Extract bulk data subset and save as a new dataframe
bulkMask = fitMeta["Type"] == "Bulk"
asBulkDF = subset[bulkMask]
# Loop through all possible samples (1-8)
for i in range(1,9):
    # Build boolean masks for GFP-positive and GFP-negative cells of this sample
    sampID = str(i)
    sampGPMask = (fitMeta["SampleID"] == i) & (fitMeta["GFP"] == True)
    sampGNMask = (fitMeta["SampleID"] == i) & (fitMeta["GFP"] == False)
    # Extract the sample subsets
    sampGPSubset = subset[sampGPMask]
    sampGNSubset = subset[sampGNMask]
    # Calculate the per-gene mean across all cells within the sample
    sampGPMean = sampGPSubset.mean()
    sampGNMean = sampGNSubset.mean()
    # Rename the mean series, e.g. "0dpa1GFPpo" for sample 1 GFP-positive
    sampGPName = dict_sampIDtoMeta[sampID][0] + dict_sampIDtoMeta[sampID][1] + "GFPpo"
    sampGPMean = sampGPMean.rename(sampGPName)
    sampGNName = dict_sampIDtoMeta[sampID][0] + dict_sampIDtoMeta[sampID][1] + "GFPne"
    sampGNMean = sampGNMean.rename(sampGNName)
    # Append the calculated means to the bulk matrix
    asBulkDF = asBulkDF.append(sampGPMean)
    asBulkDF = asBulkDF.append(sampGNMean)
    # Report the shape of the growing matrix
    print("Current matrix shape: ", asBulkDF.shape)
# Preview the combined bulk + per-sample-mean (pseudo-bulk) dataframe
asBulkDF.head()
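In [ ]:
# Compatibility note (a sketch, not from the original): DataFrame.append was removed in
# pandas 2.0. With a newer pandas, an equivalent approach is to collect the per-sample mean
# Series in a list inside the loop and concatenate them onto the bulk matrix once:
# meanRows.append(sampGPMean); meanRows.append(sampGNMean)    # inside the loop
# asBulkDF = pd.concat([asBulkDF, pd.DataFrame(meanRows)])    # after the loop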
In [ ]:
# Normalize so that each sample's total count sums to 3000
## Add a column holding each sample's total (sum over all genes)
asBulkDF["SUM"] = asBulkDF.sum(axis=1)
## Transpose so each sample is a column, which makes the division easier
asBulkD = asBulkDF.T
## Divide every value by (sample total / 3000); select the "SUM" row by label instead of a hard-coded position
normAsBulkDF = asBulkD.divide(asBulkD.loc["SUM"] / 3000)
normAsBulkDF
In [ ]:
# Transpose back and drop SUM column
DFforPCA = normAsBulkDF.T.drop(["SUM"],axis=1)
print("Shape of matrix for PCA: ",DFforPCA.shape)
DFforPCA.head()
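In [ ]:
# Sanity check (a sketch added here): after the normalization above, every row (sample)
# should sum to roughly 3000, apart from floating-point rounding.
print(DFforPCA.sum(axis=1).round(1))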
In [ ]:
# Estimate the percentage of variance explained by the principal components:
sns.set()
pca = PCA().fit(DFforPCA)
plt.figure(figsize=(8,6),dpi=300)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlim(0,12)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
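In [ ]:
# Helper (a sketch, not part of the original): read off the number of components needed to
# reach a chosen variance threshold from the curve above. 90% is an arbitrary example
# threshold; the original analysis settles on 4 components below.
cumvar = np.cumsum(pca.explained_variance_ratio_)
n_pcs = int(np.argmax(cumvar >= 0.90)) + 1
print("Components needed for 90% of variance:", n_pcs)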
In [ ]:
# Use the number of PCs determined above to transform the data
pca_tsfed = PCA(n_components=4).fit_transform(DFforPCA)
print(pca_tsfed.shape)
# Plot samples by transformed components
fig, ax = plt.subplots(figsize=(8,6),dpi=300)
ax.scatter(pca_tsfed[:, 1], pca_tsfed[:, 2],
c=["b","g","r","c","b","g","r","c","m","y","m","y","k","b","k","b","g","r","g","r","c","m","c","m"], edgecolor='none', alpha=0.5)
name = ["B4P","B4N","B0P","B0N","B4P","B4N","B0P","B0N","SC0P","SC0N","SC0P","SC0N","SC1P","SC1N","SC1P","SC1N","SC2P","SC2N","SC2P","SC2N","SC4P","SC4N","SC4P","SC4N"]
#name = DFforPCA.index
for i, txt in enumerate(name):
    ax.annotate(txt, (pca_tsfed[:, 1][i], pca_tsfed[:, 2][i]))
ax.set(xlabel='component 2', ylabel='component 3')
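In [ ]:
# Optional comparison (a sketch using the MDS import from the first cell, not an original
# step): embed the same matrix with metric MDS and check whether the samples group the
# same way as in the PCA plot. random_state is fixed only for reproducibility.
mds = MDS(n_components=2, random_state=0)
mds_tsfed = mds.fit_transform(DFforPCA)
fig, ax = plt.subplots(figsize=(8, 6), dpi=300)
ax.scatter(mds_tsfed[:, 0], mds_tsfed[:, 1], alpha=0.5)
for i, txt in enumerate(name):
    ax.annotate(txt, (mds_tsfed[i, 0], mds_tsfed[i, 1]))
ax.set(xlabel='MDS 1', ylabel='MDS 2')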