In [64]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Numerical python library (pronounced "num-pie")
import numpy as np
# Dataframes in Python
import pandas as pd
# Statistical plotting library we'll use
import seaborn as sns
# Get basename of input file
import os
# Statistic model package for Generalized Linear Models
import statsmodels.api as sm
# Stats package for chi-square distribution
from scipy.stats import chi2
# This is necessary to show the plotted figures inside the notebook -- "inline" with the notebook cells
%matplotlib inline
## Add more when seeing new modules in other notebooks!
In [65]:
# Assign variables for input file names
inDir = "../AGGallSampWGFP/"
inMtx = inDir+"outs/filtered_gene_bc_matrices_mex/cellRanger_danRer10wReporter/matrix.mtx"
inGenes = inDir+"outs/filtered_gene_bc_matrices_mex/cellRanger_danRer10wReporter/genes.tsv"
inBarcodes = inDir+"outs/filtered_gene_bc_matrices_mex/cellRanger_danRer10wReporter/barcodes.tsv"
In [66]:
# Load long format data and convert to wide format
## Read in mtx. The raw data from 10X genomics should work.
mtx = pd.read_table(inMtx,
sep=" ", header=None, low_memory=False, skiprows=3)
## Set column names for the input mtx
colName = ["GeneID", "CellID", "GeneCounts"]#,"Barcode","SampleID","GFP","ClusterID"]
mtx.columns = colName
## Convert this into a gene-by-cell matrix. Unwinding long format to wider format
widemtx = mtx.pivot(index='CellID', columns='GeneID', values='GeneCounts')
#widemtx
# Replace NaN with zero
widemtxWzero = widemtx.fillna(value=0)
print(widemtxWzero.shape)
widemtxWzero.head()
Out[66]:
In [ ]:
In [67]:
# Read in the gene list
genes = pd.read_table(inGenes, header=None)
genes.columns = ["EnsemblID","GeneShortName"]
genes.index = list(range(1,28982))
print(genes.head())
# Make a dictionary to store the gene number and GeneShortName assignments
dictGenNumName = {}
for i in range(1,28982):
dictGenNumName[i] = [genes["EnsemblID"][i],genes["GeneShortName"][i]]
print(dictGenNumName[1])
# Convert GeneID (numbers) into gene names
GeneNames = []
noConvNum = 0
for GeneID in widemtxWzero.columns:
try:
GeneName = dictGenNumName[GeneID][0]
except KeyError:
GeneName = GeneID
noConvNum += 1
GeneNames.append(GeneName)
print(noConvNum,"of GeneIDs not converted.")
print("length of GeneNames: ",len(GeneNames))
# Reassign all lables in the cell by gene matrix (geneID -> geneName)
widemtxWzero.columns = GeneNames
print(widemtxWzero.head())
In [ ]:
In [73]:
## Add cell barcodes
## Read in cell barcode file
metadata = pd.read_table(inBarcodes, header=None, names=["Barcodes"])
metadata.index = widemtxWzero.index
## Extract sample information from barcodes -> save into a new column
metadata["SampleID"] = [barcode.split("-")[1] for barcode in metadata["Barcodes"]]
## Map sampleID number to Stage/Type/Rep metadata
dict_sampIDtoMeta = {
"1": ["0dpa","1"],
"2": ["0dpa","2"],
"3": ["1dpa","1"],
"4": ["1dpa","2"],
"5": ["2dpa","1"],
"6": ["2dpa","2"],
"7": ["4dpa","1"],
"8": ["4dpa","2"],
}
metadata["Stage"] = [ dict_sampIDtoMeta[sampID][0] for sampID in metadata["SampleID"]]
metadata["Replicate"] = [ dict_sampIDtoMeta[sampID][1] for sampID in metadata["SampleID"]]
## Add single cell label to dataframe (this is for all samples)
metadata["Type"] = ["SC"] * len(metadata.index)
metadata["GFP"] = widemtxWzero["REP1"] > 0
## how does our metadata look so far?
print("Current shape of metadata: ", metadata.shape)
print("And it looks like this: \n", metadata.head())
print("There are %d stages, %d types in the dataset." % (len(np.unique(metadata["Stage"])), len(np.unique(metadata["Type"]))))
In [ ]:
In [10]:
gCountInCell = (widemtxWzero > 0).sum(axis=1)
In [11]:
sns.distplot(gCountInCell)
Out[11]:
In [12]:
sns.distplot(widemtxWzero.sum(axis=1))
Out[12]:
In [12]:
min(widemtxWzero.sum(axis=1))
Out[12]:
In [70]:
# From the distribution above, consider rm cells out of 1000-3000 gene captured range
lowMask = (widemtxWzero > 0).sum(axis=1) >= 1000
highMask = (widemtxWzero > 0).sum(axis=1) <= 3000
mask = [all(tup) for tup in zip(lowMask, highMask)]
# There are falling in this category before gene removal
len(mask)
Out[70]:
In [71]:
# Gene filter
## Calculate number of cells with a specific gene expression. Mask -> filter out genes with number of cells below 10
lowCellExpMask = (widemtxWzero > 0).sum(axis=0) > 10
np.sum(lowCellExpMask)
Out[71]:
In [72]:
## Remove lowly captured genes
expGfilt = widemtxWzero.T[lowCellExpMask]
## Remove cells with low expression profile
expG_CfT = expGfilt.T[mask]
print("Shape of DEG before filtering is: ", widemtxWzero.shape)
print("Shape of DEG after filtering is: ", expG_CfT.shape)
expG_CfT.head()
Out[72]:
In [ ]:
# Change cellID into "cell#"?
# cellIDlist = ["Cell"+str(i) for i in expG_CfT.index]
# expG_Cfilt = expG_CfT
# expG_Cfilt.index = cellIDlist
# expG_Cfilt.head()
In [79]:
# Read in feature count
FCfile = "/scratch/yhou/Fin_Reg/07_SingleCellRNA/public_data/09282016_HJ_GFPFinReg/FCs/092817_FinRegGFPwgeneID.fc"
bulkFC = pd.read_table(FCfile, index_col = 0)
print(bulkFC.shape)
bulkFC.head()
Out[79]:
In [80]:
# Remove "Aligned.sortedByCoord.out.bam" in sample names
shortSampNames = [name.split("A")[0] for name in bulkFC.columns]
bulkFC.columns = shortSampNames
bulkFC.head()
Out[80]:
In [21]:
bulkFC.sum(axis=0)
Out[21]:
In [76]:
np.average(expG_CfT.sum(axis=1))
Out[76]:
In [77]:
sns.distplot(expG_CfT.sum(axis=1))
Out[77]:
In [ ]:
# Combine the two datasets -> with/without normalization
## Transform bulk feature count (so both df will be cell-by-gene)
# combDF = expG_Cfilt.T.join(bulkFC,how = 'left')
# print(combDF.shape)
# combDF.head()
In [81]:
# Normalize HJ's bulk by total read counts closer to single cell sets (round to ~20000)
## Append one row with total count to each column, then
## Create the sum array
colSumFC = pd.DataFrame(bulkFC.sum(axis=0),columns=['Sum'])
colSumFC
## Append array to bulkFC dataframe
bulkFCwSum = bulkFC.append(colSumFC.T)
## Divide all values in each cell/sample by sum.this will normalize bulk data to 10,000 UMIs per sample)
bulkFCnorm = bulkFCwSum.divide(bulkFCwSum.iloc[28981] / 10000)
In [83]:
## Combine thie normalized set back with the singe cell set
## Transform bulk feature count (so both df will be cell-by-gene)
ncombDF = expG_CfT.T.join(bulkFCnorm,how = 'left')
print(ncombDF.shape)
ncombDF.head()
Out[83]:
In [84]:
## log transform and add GFP metadata
loggedexpwB = np.log2(ncombDF.T + 1)
loggedexpwB.tail()
Out[84]:
In [101]:
## Extract bulk sample names
bulkMeta = pd.DataFrame(data = ["NaN1","NaN2","NaN3","NaN4","NaN5","NaN6","NaN7","NaN8"],
index = bulkFC.columns,columns=["Barcodes"])
## Add all required columns as in single cell metadata
bulkMeta["SampleID"] = ["9"] * 8
bulkMeta["Stage"] = ["4dpa"]*2+["0dpa"]*2+["4dpa"]*2+["0dpa"]*2
bulkMeta["Replicate"] = ["5"] * 4 + ["6"] * 4
bulkMeta["Type"] = ["Bulk"] * 8
bulkMeta["GFP"] = ["po" in str(index) for index in bulkMeta.index ]
In [ ]:
combMeta = metadata.append(bulkMeta)
combMeta
In [105]:
## Save wide mtx and the combined metadata to file
#loggedexpwB.to_csv("AGGallSamp_wBulkNorm_logged_widemtx.csv",compression='gzip')
combMeta.to_csv("AGGallSamp_wBulk_metadata.csv")
In [ ]:
In [26]:
# Save combined, log transformed matrix to file
#loggedexpwB.to_csv("samp1_wBulk_filtered_log2_cellgene_widemtx")
In [84]:
## Save data to file
#ncombDF.to_csv("../samp1wGFP/MtxSubsets/samp1_GFPpnbulk_comb_normed_filtered_widemtx.csv")