In [49]:
%matplotlib inline

In [50]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')

In [51]:
"""
total_read_CpGs_normal_B_cellA1H1_withFilter.csv
total_read_CpGs_normal_B_cellA1H1_noFilter.csv
total_read_CpGs_pCD27mcell_withFilter.csv
total_read_CpGs_pCD27mcell_noFilter.csv
total_read_CpGs_pCD27pcell_withFilter.csv
total_read_CpGs_pCD27pcell_noFilter.csv
total_read_CpGs_NormalBCD19pcell_withFilter.csv
total_read_CpGs_NormalBCD19pcell_noFilter.csv
total_read_CpGs_CLL_cw154_withFilter.csv
total_read_CpGs_CLL_cw154_noFilter.csv
total_read_CpGs_CLL_RRBS_trito_pool_noFilter.csv

"""


Out[51]:
'\ntotal_read_CpGs_normal_B_cellA1H1_withFilter.csv\ntotal_read_CpGs_normal_B_cellA1H1_noFilter.csv\ntotal_read_CpGs_pCD27mcell_withFilter.csv\ntotal_read_CpGs_pCD27mcell_noFilter.csv\ntotal_read_CpGs_pCD27pcell_withFilter.csv\ntotal_read_CpGs_pCD27pcell_noFilter.csv\ntotal_read_CpGs_NormalBCD19pcell_withFilter.csv\ntotal_read_CpGs_NormalBCD19pcell_noFilter.csv\ntotal_read_CpGs_CLL_cw154_withFilter.csv\ntotal_read_CpGs_CLL_cw154_noFilter.csv\ntotal_read_CpGs_CLL_RRBS_trito_pool_noFilter.csv\n\n'

In [52]:
normal_B_filtered = pd.read_csv("total_read_CpGs_normal_B_cellA1H1_withFilter.csv")
normal_B_unfiltered = pd.read_csv("total_read_CpGs_normal_B_cellA1H1_noFilter.csv")
CD27pcell_filtered = pd.read_csv("total_read_CpGs_pCD27pcell_withFilter.csv")
CD27pcell_unfiltered = pd.read_csv("total_read_CpGs_pCD27pcell_noFilter.csv")
CD27mcell_filtered = pd.read_csv("total_read_CpGs_pCD27mcell_withFilter.csv")
CD27mcell_unfiltered = pd.read_csv("total_read_CpGs_pCD27mcell_noFilter.csv")
NormalBCD19pcell_filtered = pd.read_csv("total_read_CpGs_NormalBCD19pcell_withFilter.csv")
NormalBCD19pcell_unfiltered = pd.read_csv("total_read_CpGs_NormalBCD19pcell_noFilter.csv")
trito_both = pd.read_csv("total_read_CpGs_CLL_RRBS_trito_pool_noFilter.csv")
cll_cw154_filtered = pd.read_csv("total_read_CpGs_CLL_cw154_withFilter.csv")
cll_cw154_unfiltered = pd.read_csv("total_read_CpGs_CLL_cw154_noFilter.csv")

In [53]:
normal_B_filtered = normal_B_filtered.drop(["Unnamed: 0"], axis=1)
normal_B_unfiltered = normal_B_unfiltered.drop(["Unnamed: 0"], axis=1)
CD27pcell_filtered = CD27pcell_filtered.drop(["Unnamed: 0"], axis=1)
CD27pcell_unfiltered = CD27pcell_unfiltered.drop(["Unnamed: 0"], axis=1)
CD27mcell_filtered = CD27mcell_filtered.drop(["Unnamed: 0"], axis=1)
CD27mcell_unfiltered = CD27mcell_unfiltered.drop(["Unnamed: 0"], axis=1)
NormalBCD19pcell_filtered = NormalBCD19pcell_filtered.drop(["Unnamed: 0"], axis=1)
NormalBCD19pcell_unfiltered = NormalBCD19pcell_unfiltered.drop(["Unnamed: 0"], axis=1)
trito_both = trito_both.drop(["Unnamed: 0"], axis=1)
cll_cw154_filtered = cll_cw154_filtered.drop(["Unnamed: 0"], axis=1)
cll_cw154_unfiltered = cll_cw154_unfiltered.drop(["Unnamed: 0"], axis=1)

In [54]:
print(normal_B_filtered.shape)
print(normal_B_unfiltered.shape)
print(CD27pcell_filtered.shape)
print(CD27pcell_unfiltered.shape)
print(CD27mcell_filtered.shape)
print(CD27mcell_unfiltered.shape)
print(NormalBCD19pcell_filtered.shape)
print(NormalBCD19pcell_unfiltered.shape)
print(trito_both.shape)
print(cll_cw154_filtered.shape)
print(cll_cw154_unfiltered.shape)


(113, 3)
(138, 3)
(69, 3)
(91, 3)
(74, 3)
(88, 3)
(83, 3)
(89, 3)
(44, 3)
(60, 3)
(66, 3)

In [55]:
filtered_files = [normal_B_filtered, CD27pcell_filtered, CD27mcell_filtered, NormalBCD19pcell_filtered, trito_both, cll_cw154_filtered]

In [56]:
filtered_all = pd.concat(filtered_files)

In [57]:
unfiltered_files = [normal_B_unfiltered, CD27pcell_unfiltered, CD27mcell_unfiltered, NormalBCD19pcell_unfiltered, trito_both, cll_cw154_unfiltered]

In [58]:
unfiltered_all = pd.concat(unfiltered_files)

In [59]:
print(filtered_all.shape)
print(unfiltered_all.shape)


(443, 3)
(516, 3)

In [60]:
unfiltered_all_pl = unfiltered_all.loc[:,['total_reads']]
unfiltered_all_pl = unfiltered_all_pl.reset_index(drop=True)

In [61]:
unfiltered_all_pl.plot(kind='area')
plt.title('Unfiltered, total reads, 516 *.anno files')
plt.xlabel('516 total files')  # still a few files with reads less than 10K


Out[61]:
<matplotlib.text.Text at 0x11603d198>

In [62]:
unfiltered_all2 = unfiltered_all.loc[:,['total_unique_CpGs']]
unfiltered_all2 = unfiltered_all2.reset_index(drop=True)

In [63]:
unfiltered_all2.plot(kind='area', color='g')
plt.title('Unfiltered, total unique CpG per cells, 443 *.anno files')
plt.xlabel('516 total files')  # still a few files with reads less than 10K


Out[63]:
<matplotlib.text.Text at 0x11636c470>

In [64]:
filtered_all_pl = filtered_all.loc[:,['total_reads']]
filtered_all_pl = filtered_all_pl.reset_index(drop=True)

In [65]:
filtered_all_pl.plot(kind='area')
plt.title('Filtered, total reads, 516 *.anno files')
plt.xlabel('443 total files')  # still a few files with reads less than 10K


Out[65]:
<matplotlib.text.Text at 0x1163a9550>

In [66]:
filtered_all_pl2 = filtered_all.loc[:,['total_unique_CpGs']]
filtered_all_pl2 = filtered_all_pl2.reset_index(drop=True)

In [144]:
filtered_all_pl2.plot(kind='area', color='g')
plt.title('Filtered, total unique CpG per cells, 443 *.anno files')
plt.xlabel('443 total files')  # still a few files with reads less than 10K


Out[144]:
<matplotlib.text.Text at 0x11a3670f0>

In [ ]:


In [68]:
unfiltered_all3 = unfiltered_all.loc[:,['total_reads', 'total_unique_CpGs']]
unfiltered_all3 = unfiltered_all3.reset_index(drop=True)

In [69]:
unfiltered_all3.plot(kind='area')
plt.title('Uniltered, total_reads & total unique CpG per cells, 516 *.anno files')
plt.xlabel('443 total files')  # still a few files with reads less than 10K


Out[69]:
<matplotlib.text.Text at 0x11669a9e8>

In [70]:
filtered_all3 = filtered_all.loc[:,['total_reads', 'total_unique_CpGs']]
filtered_all3 = filtered_all3.reset_index(drop=True)

In [71]:
filtered_all3.plot(kind='area')
plt.title('Filtered, total_reads & total unique CpG per cells, 443 *.anno files')
plt.xlabel('443 total files')  # still a few files with reads less than 10K


Out[71]:
<matplotlib.text.Text at 0x1165f10f0>

In [72]:
df1 = pd.read_table('allStats.txt')

df1 = df1.drop('sample.1', axis=1)
df1 = df1.drop('sample.2', axis=1)

In [73]:
df1.head()


Out[73]:
sample class totMeth totSeen avSum avTot rMixed rTot rAv rAvTot ... totAligned totUsed totClipped totMethCpG totSeenCpG totCpG totalReadPairs totalReads alignedReads bsRate
0 RRBS_trito_pool_1_TAAGGCGA.ACAACC scCLL 16200247 26459744 589804.381329 979272 1707218 6898212 63327.391726 248437 ... 7883322 7883322 120491 16374456 26716690 995714 8297596 16595192 11666213 0.980280
1 RRBS_trito_pool_1_TAAGGCGA.ACGTGG scCLL 9323253 15679703 411855.757258 694650 1008783 4076392 44693.975166 174308 ... 4679956 4679956 68785 9443072 15850878 705787 4828405 9656810 6822031 0.980081
2 RRBS_trito_pool_1_TAAGGCGA.ACTCAC scCLL 13936061 22777935 511479.409928 851529 1417386 5739945 54134.747198 213209 ... 6500492 6500492 108121 14067287 22965785 865744 7084390 14168780 9869725 0.980305
3 RRBS_trito_pool_1_TAAGGCGA.AGGATG scCLL 15308780 24985015 567608.363999 939422 1604313 6515345 59514.352762 235074 ... 7455882 7455882 112957 15482976 25239223 955160 11133907 22267814 11048117 0.980392
4 RRBS_trito_pool_1_TAAGGCGA.ATAGCG scCLL 9079622 15723841 365196.872199 624686 1012941 3996623 40262.819815 155636 ... 4548257 4548257 69445 9169815 15859084 634455 4912633 9825266 6881418 0.980256

5 rows × 23 columns


In [74]:
df1 = df1.drop('class', axis=1)
df1 = df1.drop('totMeth', axis=1)
df1 = df1.drop('totSeen', axis=1)
df1 = df1.drop('avSum', axis=1)
df1 = df1.drop('avTot', axis=1)
df1 = df1.drop('rMixed', axis=1)
df1 = df1.drop('rTot', axis=1)
df1 = df1.drop('rAv', axis=1)
df1 = df1.drop('rAvTot', axis=1)
df1 = df1.drop('bed', axis=1)
df1 = df1.drop('methInfoFile', axis=1)
df1 = df1.drop('totReads', axis=1)
df1 = df1.drop('totAligned', axis=1)
df1 = df1.drop('totClipped', axis=1)

df1 = df1.drop('totUsed', axis=1)
df1 = df1.drop('totMethCpG', axis=1)
df1 = df1.drop('totSeenCpG', axis=1)
# df1 = df1.drop('totCpG', axis=1)
df1 = df1.drop('totalReadPairs', axis=1)
df1 = df1.drop('alignedReads', axis=1)
df1 = df1.drop('bsRate', axis=1)

In [75]:
df1.head()


Out[75]:
sample totCpG totalReads
0 RRBS_trito_pool_1_TAAGGCGA.ACAACC 995714 16595192
1 RRBS_trito_pool_1_TAAGGCGA.ACGTGG 705787 9656810
2 RRBS_trito_pool_1_TAAGGCGA.ACTCAC 865744 14168780
3 RRBS_trito_pool_1_TAAGGCGA.AGGATG 955160 22267814
4 RRBS_trito_pool_1_TAAGGCGA.ATAGCG 634455 9825266

In [76]:
df1.shape


Out[76]:
(536, 3)

In [77]:
df1 = df1.rename(columns={'sample': 'filename'})

In [78]:
df1.head()


Out[78]:
filename totCpG totalReads
0 RRBS_trito_pool_1_TAAGGCGA.ACAACC 995714 16595192
1 RRBS_trito_pool_1_TAAGGCGA.ACGTGG 705787 9656810
2 RRBS_trito_pool_1_TAAGGCGA.ACTCAC 865744 14168780
3 RRBS_trito_pool_1_TAAGGCGA.AGGATG 955160 22267814
4 RRBS_trito_pool_1_TAAGGCGA.ATAGCG 634455 9825266

In [79]:
stats_evan_both_unfiltered = unfiltered_all.merge(df1, on="filename")

In [80]:
stats_evan_both_unfiltered.shape


Out[80]:
(516, 5)

In [81]:
stats_evan_both_unfiltered_plot = stats_evan_both_unfiltered.drop('filename', axis=1)

In [82]:
stats_evan_both_unfiltered_plot = stats_evan_both_unfiltered_plot.rename(columns={'totCpG': 'allStats.txt totCpG', 'totalReads': 'allStats.txt total reads'})

In [83]:
stats_evan_both_unfiltered_plot.columns


Out[83]:
Index(['total_reads', 'total_unique_CpGs', 'allStats.txt totCpG',
       'allStats.txt total reads'],
      dtype='object')

In [84]:
stats_evan_both_unfiltered_plot2 = stats_evan_both_unfiltered_plot.loc[:,['total_reads', 'allStats.txt total reads']]
stats_evan_both_unfiltered_plot2 = stats_evan_both_unfiltered_plot2.reset_index(drop=True)

In [85]:
stats_evan_both_unfiltered_plot2.plot(kind='area')
plt.title('Unfiltered, allStats.txt versus Broad URL, total reads, 516 *.anno files')
plt.xlabel('516 total files')  # still a few files with reads less than 10K


Out[85]:
<matplotlib.text.Text at 0x1167a1320>

In [86]:
stats_evan_both_unfiltered_plot3 = stats_evan_both_unfiltered_plot.loc[:,['total_unique_CpGs', 'allStats.txt totCpG']]
stats_evan_both_unfiltered_plot3 = stats_evan_both_unfiltered_plot3.reset_index(drop=True)

In [87]:
stats_evan_both_unfiltered_plot3.plot(kind='area')
plt.title('Unfiltered, allStats.txt versus Broad URL, total CpG, 516 *.anno files')
plt.xlabel('516 total files')  # still a few files with reads less than 10K


Out[87]:
<matplotlib.text.Text at 0x1168cbd68>

In [88]:
stats_evan_both_filtered = filtered_all.merge(df1, on="filename")

In [89]:
stats_evan_both_filtered.shape


Out[89]:
(443, 5)

In [90]:
stats_evan_both_filtered_plot = stats_evan_both_filtered.drop('filename', axis=1)

In [91]:
stats_evan_both_filtered_plot = stats_evan_both_filtered_plot.rename(columns={'totCpG': 'allStats.txt totCpG', 'totalReads': 'allStats.txt total reads'})

In [92]:
stats_evan_both_filtered_plot.columns


Out[92]:
Index(['total_reads', 'total_unique_CpGs', 'allStats.txt totCpG',
       'allStats.txt total reads'],
      dtype='object')

In [93]:
stats_evan_both_filtered_plot2 = stats_evan_both_filtered_plot.loc[:,['total_reads', 'allStats.txt total reads']]
stats_evan_both_filtered_plot2 = stats_evan_both_filtered_plot2.reset_index(drop=True)

In [94]:
stats_evan_both_filtered_plot2.plot(kind='area')
plt.title('Filtered, allStats.txt versus Broad URL, total reads, 443 *.anno files')
plt.xlabel('443 total files')  # still a few files with reads less than 10K


Out[94]:
<matplotlib.text.Text at 0x116f61f28>

In [ ]:


In [95]:
stats_evan_both_filtered_plot3 = stats_evan_both_filtered_plot.loc[:,['total_unique_CpGs', 'allStats.txt totCpG']]
stats_evan_both_filtered_plot3 = stats_evan_both_filtered_plot3.reset_index(drop=True)

In [96]:
stats_evan_both_filtered_plot3.plot(kind='area')
plt.title('Filtered, allStats.txt versus Broad URL, totCpG--total unique CpG per cells, 443 *.anno files')
plt.xlabel('443 total files')  # still a few files with reads less than 10K


Out[96]:
<matplotlib.text.Text at 0x116c27c50>

In [ ]:


In [ ]:
#
#
# The above output is y-axis: sum total # unique CpGs count, x-axis is anno file. 
# 
# It appears you want:  y-axis: # of cells with sum total CpG count, x-axis is range of sum total CpG counts per file.
#

In [100]:
unfiltered_all2.plot(kind='area', color='g')
plt.title('Filtered, total unique CpG per cells, 443 *.anno files')
plt.xlabel('443 total files')  # still a few files with reads less than 10K


Out[100]:
<matplotlib.text.Text at 0x117199fd0>

In [101]:
unfiltered_all2.columns


Out[101]:
Index(['total_unique_CpGs'], dtype='object')

In [108]:
plt.hist(unfiltered_all2['total_unique_CpGs'], bins=15)


Out[108]:
(array([  85.,  128.,  115.,   70.,   47.,   40.,   19.,    3.,    3.,
           2.,    3.,    0.,    0.,    0.,    1.]),
 array([  3.71000000e+02,   1.39864733e+05,   2.79358467e+05,
          4.18852200e+05,   5.58345933e+05,   6.97839667e+05,
          8.37333400e+05,   9.76827133e+05,   1.11632087e+06,
          1.25581460e+06,   1.39530833e+06,   1.53480207e+06,
          1.67429580e+06,   1.81378953e+06,   1.95328327e+06,
          2.09277700e+06]),
 <a list of 15 Patch objects>)

In [125]:
num_bins = 50
# the histogram of the data
x = unfiltered_all2['total_unique_CpGs']
n, bins, patches = plt.hist(x, num_bins, normed=1, facecolor='green', alpha=0.5)
plt.xlabel('Smarts')
plt.ylabel('Probability')
plt.title(r'Histogram of IQ: $\mu=100$, $\sigma=15$')


Out[125]:
<matplotlib.text.Text at 0x118ae66a0>

In [168]:
num_bins = 75
data = unfiltered_all2['total_unique_CpGs']
plt.hist(data, num_bins, facecolor='green', alpha=0.75)
plt.ylabel('Number of cells')
plt.xlabel('Sum total Unique CpGs, bins=75; total 516 files')
plt.title(r'Unfiltered sum total unique CpGs per *.anno file')


Out[168]:
<matplotlib.text.Text at 0x11bd368d0>

In [169]:
unfiltered_all_pl[:10]


Out[169]:
total_reads
0 11894660.0
1 3744659.0
2 10461874.0
3 14051.0
4 21928743.0
5 10864882.0
6 18977710.0
7 15806813.0
8 18369519.0
9 4828.0

In [170]:
num_bins = 75
data = unfiltered_all_pl['total_reads']
plt.hist(data, num_bins, alpha=0.75)
plt.ylabel('Number of cells')
plt.xlabel('total reads, bins=75; total 516 files')
plt.title(r'Unfiltered total reads per *.anno file')


Out[170]:
<matplotlib.text.Text at 0x11bf774e0>

In [171]:
filtered_all_pl[:10]


Out[171]:
total_reads
0 11894660.0
1 10461874.0
2 21928743.0
3 10864882.0
4 18977710.0
5 15806813.0
6 18369519.0
7 11929945.0
8 22563380.0
9 11977642.0

In [172]:
num_bins = 75
data = filtered_all_pl['total_reads']
plt.hist(data, num_bins )
plt.ylabel('Number of cells')
plt.xlabel('total reads, bins=75; total 443 files')
plt.title(r'Filtered total reads per *.anno file')


Out[172]:
<matplotlib.text.Text at 0x11c0dae80>

In [173]:
num_bins = 75
data = filtered_all_pl2['total_unique_CpGs']
plt.hist(data, num_bins, facecolor='green' )
plt.ylabel('Number of cells')
plt.xlabel('Sum total Unique CpGs, bins=75; total 443 files')
plt.title(r'Filtered Sum total Unique CpGs per *.anno file')


Out[173]:
<matplotlib.text.Text at 0x11c30e898>

In [ ]:


In [ ]:


In [ ]: