Load and untidying 10x matrix data


In [64]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Numerical python library (pronounced "num-pie")
import numpy as np
# Dataframes in Python
import pandas as pd
# Statistical plotting library we'll use
import seaborn as sns
# Get basename of input file
import os
# Statistic model package for Generalized Linear Models
import statsmodels.api as sm
# Stats package for chi-square distribution
from scipy.stats import chi2

# This is necessary to show the plotted figures inside the notebook -- "inline" with the notebook cells
%matplotlib inline
## Add more when seeing new modules in other notebooks!

In [65]:
# Assign variables for input file names
inDir = "../AGGallSampWGFP/"
inMtx = inDir+"outs/filtered_gene_bc_matrices_mex/cellRanger_danRer10wReporter/matrix.mtx"
inGenes = inDir+"outs/filtered_gene_bc_matrices_mex/cellRanger_danRer10wReporter/genes.tsv"
inBarcodes = inDir+"outs/filtered_gene_bc_matrices_mex/cellRanger_danRer10wReporter/barcodes.tsv"

In [66]:
# Load long format data and convert to wide format
## Read in mtx. The raw data from 10X genomics should work.
mtx = pd.read_table(inMtx,
                    sep=" ", header=None, low_memory=False, skiprows=3)
## Set column names for the input mtx
colName = ["GeneID", "CellID", "GeneCounts"]#,"Barcode","SampleID","GFP","ClusterID"]
mtx.columns = colName
## Convert this into a gene-by-cell matrix. Unwinding long format to wider format
widemtx = mtx.pivot(index='CellID', columns='GeneID', values='GeneCounts')
#widemtx
# Replace NaN with zero
widemtxWzero = widemtx.fillna(value=0)
print(widemtxWzero.shape)
widemtxWzero.head()


(18773, 24122)
Out[66]:
GeneID 1 2 4 5 6 7 8 9 10 11 ... 28956 28957 28958 28960 28961 28962 28963 28964 28965 28980
CellID
1 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 4.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 1.0

5 rows × 24122 columns


In [ ]:


In [67]:
# Read in the gene list
genes = pd.read_table(inGenes, header=None)
genes.columns = ["EnsemblID","GeneShortName"]
genes.index = list(range(1,28982))
print(genes.head())
# Make a dictionary to store the gene number and GeneShortName assignments
dictGenNumName = {}
for i in range(1,28982):
    dictGenNumName[i] = [genes["EnsemblID"][i],genes["GeneShortName"][i]]
print(dictGenNumName[1])
# Convert GeneID (numbers) into gene names
GeneNames = []
noConvNum = 0
for GeneID in widemtxWzero.columns:
    try:
        GeneName = dictGenNumName[GeneID][0]
    except KeyError:
        GeneName = GeneID
        noConvNum += 1
    GeneNames.append(GeneName)
print(noConvNum,"of GeneIDs not converted.")
print("length of GeneNames: ",len(GeneNames))
# Reassign all lables in the cell by gene matrix (geneID -> geneName)
widemtxWzero.columns = GeneNames
print(widemtxWzero.head())


            EnsemblID       GeneShortName
1  ENSDARG00000104632                RERG
2  ENSDARG00000100660  ENSDARG00000100660
3  ENSDARG00000098417                syn3
4  ENSDARG00000100422               ptpro
5  ENSDARG00000102128                eps8
['ENSDARG00000104632', 'RERG']
0 of GeneIDs not converted.
length of GeneNames:  24122
        ENSDARG00000104632  ENSDARG00000100660  ENSDARG00000100422  \
CellID                                                               
1                      0.0                 0.0                 0.0   
2                      0.0                 0.0                 0.0   
3                      0.0                 0.0                 0.0   
4                      0.0                 0.0                 0.0   
5                      0.0                 0.0                 0.0   

        ENSDARG00000102128  ENSDARG00000103095  ENSDARG00000102226  \
CellID                                                               
1                      0.0                 0.0                 0.0   
2                      0.0                 0.0                 0.0   
3                      0.0                 0.0                 0.0   
4                      0.0                 0.0                 0.0   
5                      0.0                 3.0                 0.0   

        ENSDARG00000104049  ENSDARG00000102474  ENSDARG00000100143  \
CellID                                                               
1                      1.0                 0.0                 0.0   
2                      0.0                 0.0                 0.0   
3                      0.0                 0.0                 0.0   
4                      0.0                 0.0                 0.0   
5                      0.0                 0.0                 0.0   

        ENSDARG00000104839  ...   ENSDARG00000097721  ENSDARG00000051761  \
CellID                      ...                                            
1                      0.0  ...                  0.0                 0.0   
2                      0.0  ...                  0.0                 0.0   
3                      0.0  ...                  0.0                 0.0   
4                      0.0  ...                  0.0                 0.0   
5                      0.0  ...                  0.0                 0.0   

        ENSDARG00000093480  ENSDARG00000089892  ENSDARG00000051756  \
CellID                                                               
1                      0.0                 0.0                 0.0   
2                      0.0                 0.0                 0.0   
3                      0.0                 0.0                 0.0   
4                      0.0                 0.0                 0.0   
5                      0.0                 0.0                 0.0   

        ENSDARG00000102643  ENSDARG00000103220  ENSDARG00000102863  \
CellID                                                               
1                      1.0                 0.0                 0.0   
2                      0.0                 0.0                 0.0   
3                      1.0                 0.0                 0.0   
4                      0.0                 0.0                 0.0   
5                      0.0                 0.0                 0.0   

        ENSDARG00000101560  REP1  
CellID                            
1                      4.0   0.0  
2                      0.0   0.0  
3                      2.0   0.0  
4                      0.0   0.0  
5                      2.0   1.0  

[5 rows x 24122 columns]

In [ ]:


In [73]:
## Add cell barcodes
## Read in cell barcode file
metadata = pd.read_table(inBarcodes, header=None, names=["Barcodes"])
metadata.index = widemtxWzero.index
## Extract sample information from barcodes -> save into a new column
metadata["SampleID"] = [barcode.split("-")[1] for barcode in metadata["Barcodes"]]
## Map sampleID number to Stage/Type/Rep metadata
dict_sampIDtoMeta = {
    "1": ["0dpa","1"],
    "2": ["0dpa","2"],
    "3": ["1dpa","1"],
    "4": ["1dpa","2"],
    "5": ["2dpa","1"],
    "6": ["2dpa","2"],
    "7": ["4dpa","1"],
    "8": ["4dpa","2"],
                    }
metadata["Stage"] = [ dict_sampIDtoMeta[sampID][0] for sampID in metadata["SampleID"]]
metadata["Replicate"] = [ dict_sampIDtoMeta[sampID][1] for sampID in metadata["SampleID"]]
## Add single cell label to dataframe (this is for all samples)
metadata["Type"] = ["SC"] * len(metadata.index)
metadata["GFP"] = widemtxWzero["REP1"] > 0 
## how does our metadata look so far?
print("Current shape of metadata: ", metadata.shape)
print("And it looks like this: \n", metadata.head())
print("There are %d stages, %d types in the dataset." % (len(np.unique(metadata["Stage"])), len(np.unique(metadata["Type"]))))


Current shape of metadata:  (18773, 6)
And it looks like this: 
                   Barcodes SampleID Stage Replicate Type    GFP
CellID                                                         
1       AAACCTGAGAGTCGGT-1        1  0dpa         1   SC  False
2       AAACCTGCAAATCCGT-1        1  0dpa         1   SC  False
3       AAACCTGGTATAATGG-1        1  0dpa         1   SC  False
4       AAACCTGTCAGCTTAG-1        1  0dpa         1   SC  False
5       AAACCTGTCATATCGG-1        1  0dpa         1   SC   True
There are 4 stages, 1 types in the dataset.

In [ ]:

Prior filtering distributions


In [10]:
gCountInCell = (widemtxWzero > 0).sum(axis=1)

In [11]:
sns.distplot(gCountInCell)


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f50b1205048>

In [12]:
sns.distplot(widemtxWzero.sum(axis=1))


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f50ae436390>

In [12]:
min(widemtxWzero.sum(axis=1))


Out[12]:
3270.0

Filter out low genes/ low cells!


In [70]:
# From the distribution above, consider rm cells out of 1000-3000 gene captured range
lowMask = (widemtxWzero > 0).sum(axis=1) >= 1000
highMask = (widemtxWzero > 0).sum(axis=1) <= 3000
mask = [all(tup) for tup in zip(lowMask, highMask)]
# There are  falling in this category before gene removal
len(mask)


Out[70]:
18773

In [71]:
# Gene filter
## Calculate number of cells with a specific gene expression. Mask -> filter out genes with number of cells below 10
lowCellExpMask = (widemtxWzero > 0).sum(axis=0) > 10
np.sum(lowCellExpMask)


Out[71]:
18830

In [72]:
## Remove lowly captured genes
expGfilt = widemtxWzero.T[lowCellExpMask]
## Remove cells with low expression profile
expG_CfT = expGfilt.T[mask]
print("Shape of DEG before filtering is: ", widemtxWzero.shape)
print("Shape of DEG after filtering is: ", expG_CfT.shape)
expG_CfT.head()


Shape of DEG before filtering is:  (18773, 24122)
Shape of DEG after filtering is:  (16172, 18830)
Out[72]:
ENSDARG00000100660 ENSDARG00000100422 ENSDARG00000102128 ENSDARG00000103095 ENSDARG00000102226 ENSDARG00000104049 ENSDARG00000102474 ENSDARG00000100143 ENSDARG00000104373 ENSDARG00000098311 ... ENSDARG00000078600 ENSDARG00000097721 ENSDARG00000051761 ENSDARG00000093480 ENSDARG00000089892 ENSDARG00000051756 ENSDARG00000102643 ENSDARG00000103220 ENSDARG00000101560 REP1
CellID
1 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 4.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 2.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 1.0

5 rows × 18830 columns


In [ ]:
# Change cellID into "cell#"?
# cellIDlist = ["Cell"+str(i) for i in expG_CfT.index]
# expG_Cfilt = expG_CfT
# expG_Cfilt.index = cellIDlist
# expG_Cfilt.head()

Take in bulk-seq feature counts


In [79]:
# Read in feature count
FCfile = "/scratch/yhou/Fin_Reg/07_SingleCellRNA/public_data/09282016_HJ_GFPFinReg/FCs/092817_FinRegGFPwgeneID.fc"
bulkFC = pd.read_table(FCfile, index_col = 0)
print(bulkFC.shape)
bulkFC.head()


(28981, 8)
Out[79]:
01_4dpaGFPpo_rep5Aligned.sortedByCoord.out.bam 02_4dpaGFPne_rep5Aligned.sortedByCoord.out.bam 03_0dpaGFPpo_rep5Aligned.sortedByCoord.out.bam 04_0dpaGFPne_rep5Aligned.sortedByCoord.out.bam 05_4dpaGFPpo_rep6Aligned.sortedByCoord.out.bam 06_4dpaGFPne_rep6Aligned.sortedByCoord.out.bam 07_0dpaGFPpo_rep6Aligned.sortedByCoord.out.bam 08_0dpaGFPne_rep6Aligned.sortedByCoord.out.bam
Geneid
ENSDARG00000104632 896 107 189 80 897 109 431 106
ENSDARG00000100660 663 824 809 519 482 698 795 677
ENSDARG00000098417 4 2 0 0 3 4 1 0
ENSDARG00000100422 1484 1484 102 58 1321 1147 182 106
ENSDARG00000102128 892 1929 1143 2554 893 2009 1289 2902

In [80]:
# Remove "Aligned.sortedByCoord.out.bam" in sample names
shortSampNames = [name.split("A")[0] for name in bulkFC.columns]
bulkFC.columns = shortSampNames
bulkFC.head()


Out[80]:
01_4dpaGFPpo_rep5 02_4dpaGFPne_rep5 03_0dpaGFPpo_rep5 04_0dpaGFPne_rep5 05_4dpaGFPpo_rep6 06_4dpaGFPne_rep6 07_0dpaGFPpo_rep6 08_0dpaGFPne_rep6
Geneid
ENSDARG00000104632 896 107 189 80 897 109 431 106
ENSDARG00000100660 663 824 809 519 482 698 795 677
ENSDARG00000098417 4 2 0 0 3 4 1 0
ENSDARG00000100422 1484 1484 102 58 1321 1147 182 106
ENSDARG00000102128 892 1929 1143 2554 893 2009 1289 2902

In [21]:
bulkFC.sum(axis=0)


Out[21]:
01_4dpaGFPpo_rep5    47061460
02_4dpaGFPne_rep5    47383189
03_0dpaGFPpo_rep5    27009827
04_0dpaGFPne_rep5    35852577
05_4dpaGFPpo_rep6    35610717
06_4dpaGFPne_rep6    43817832
07_0dpaGFPpo_rep6    26472854
08_0dpaGFPne_rep6    42742471
dtype: int64

In [76]:
np.average(expG_CfT.sum(axis=1))


Out[76]:
8391.5390180558989

In [77]:
sns.distplot(expG_CfT.sum(axis=1))


Out[77]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f50ad6e0da0>

In [ ]:
# Combine the two datasets -> with/without normalization
## Transform bulk feature count (so both df will be cell-by-gene)
# combDF = expG_Cfilt.T.join(bulkFC,how = 'left')
# print(combDF.shape)
# combDF.head()
  • Normalize bulk-seq data by total counts captured

In [81]:
# Normalize HJ's bulk by total read counts closer to single cell sets (round to ~20000)
## Append one row with total count to each column, then 
## Create the sum array
colSumFC = pd.DataFrame(bulkFC.sum(axis=0),columns=['Sum'])
colSumFC
## Append array to bulkFC dataframe
bulkFCwSum = bulkFC.append(colSumFC.T)
## Divide all values in each cell/sample by sum.this will normalize bulk data to 10,000 UMIs per sample)
bulkFCnorm = bulkFCwSum.divide(bulkFCwSum.iloc[28981] / 10000)

In [83]:
## Combine thie normalized set back with the singe cell set
## Transform bulk feature count (so both df will be cell-by-gene)
ncombDF = expG_CfT.T.join(bulkFCnorm,how = 'left')
print(ncombDF.shape)
ncombDF.head()


(18830, 16180)
Out[83]:
1 2 3 4 5 6 7 9 10 11 ... 18770 18773 01_4dpaGFPpo_rep5 02_4dpaGFPne_rep5 03_0dpaGFPpo_rep5 04_0dpaGFPne_rep5 05_4dpaGFPpo_rep6 06_4dpaGFPne_rep6 07_0dpaGFPpo_rep6 08_0dpaGFPne_rep6
ENSDARG00000100660 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.140880 0.173901 0.299521 0.144759 0.135353 0.159296 0.300308 0.158390
ENSDARG00000100422 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.315332 0.313191 0.037764 0.016177 0.370956 0.261766 0.068750 0.024800
ENSDARG00000102128 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.189539 0.407106 0.423179 0.712362 0.250767 0.458489 0.486914 0.678950
ENSDARG00000103095 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 2.0 ... 0.0 0.0 1.152110 0.925012 2.966698 1.124884 1.113429 0.844633 2.642707 0.957362
ENSDARG00000102226 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.029323 0.034189 0.009626 0.014504 0.036225 0.039253 0.012088 0.011230

5 rows × 16180 columns


In [84]:
## log transform and add GFP metadata
loggedexpwB = np.log2(ncombDF.T + 1)
loggedexpwB.tail()


Out[84]:
ENSDARG00000100660 ENSDARG00000100422 ENSDARG00000102128 ENSDARG00000103095 ENSDARG00000102226 ENSDARG00000104049 ENSDARG00000102474 ENSDARG00000100143 ENSDARG00000104373 ENSDARG00000098311 ... ENSDARG00000078600 ENSDARG00000097721 ENSDARG00000051761 ENSDARG00000093480 ENSDARG00000089892 ENSDARG00000051756 ENSDARG00000102643 ENSDARG00000103220 ENSDARG00000101560 REP1
04_0dpaGFPne_rep5 0.195044 0.023152 0.775987 1.087384 0.020774 0.260332 1.041203 0.570504 0.143882 0.228744 ... 0.055651 0.056425 0.050220 0.144974 0.050220 0.777866 1.691768 0.444289 1.741714 0.018392
05_4dpaGFPpo_rep6 0.183140 0.455182 0.322813 1.079586 0.051337 0.205447 0.570157 0.759146 0.182069 0.335391 ... 0.333463 0.178494 0.142244 0.312737 0.020515 0.866590 1.358315 0.471928 1.912934 3.027165
06_4dpaGFPne_rep6 0.213249 0.335444 0.544475 0.883334 0.055548 0.167663 0.690760 0.822824 0.163847 0.291733 ... 0.264856 0.162377 0.129045 0.694427 0.108119 0.722833 1.054790 0.271963 1.949547 0.019621
07_0dpaGFPpo_rep6 0.378853 0.095924 0.572321 1.865011 0.017335 0.279984 0.840460 0.798777 0.111645 0.448398 ... 0.077966 0.060304 0.057164 0.183948 0.023782 1.032524 2.639556 1.569517 1.686388 2.015198
08_0dpaGFPne_rep6 0.212122 0.035342 0.747559 0.968910 0.016111 0.216776 0.809536 0.405195 0.125974 0.190988 ... 0.051391 0.048130 0.037974 0.116353 0.101329 0.682170 1.588278 0.400855 1.647333 0.019112

5 rows × 18830 columns

Make metadata for bulk-seq data, combine with sc metadata


In [101]:
## Extract bulk sample names
bulkMeta = pd.DataFrame(data = ["NaN1","NaN2","NaN3","NaN4","NaN5","NaN6","NaN7","NaN8"],
                        index = bulkFC.columns,columns=["Barcodes"])
## Add all required columns as in single cell metadata
bulkMeta["SampleID"] = ["9"] * 8
bulkMeta["Stage"] = ["4dpa"]*2+["0dpa"]*2+["4dpa"]*2+["0dpa"]*2
bulkMeta["Replicate"] = ["5"] * 4 + ["6"] * 4
bulkMeta["Type"] = ["Bulk"] * 8
bulkMeta["GFP"] = ["po" in str(index) for index in bulkMeta.index ]

In [ ]:
combMeta = metadata.append(bulkMeta)
combMeta

In [105]:
## Save wide mtx and the combined metadata to file
#loggedexpwB.to_csv("AGGallSamp_wBulkNorm_logged_widemtx.csv",compression='gzip')
combMeta.to_csv("AGGallSamp_wBulk_metadata.csv")

In [ ]:


In [26]:
# Save combined, log transformed matrix to file
#loggedexpwB.to_csv("samp1_wBulk_filtered_log2_cellgene_widemtx")

In [84]:
## Save data to file
#ncombDF.to_csv("../samp1wGFP/MtxSubsets/samp1_GFPpnbulk_comb_normed_filtered_widemtx.csv")