notebook.community

Edit and run



In [49]:

    
%matplotlib inline



In [50]:

    
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')



In [51]:

    
"""
total_read_CpGs_normal_B_cellA1H1_withFilter.csv
total_read_CpGs_normal_B_cellA1H1_noFilter.csv
total_read_CpGs_pCD27mcell_withFilter.csv
total_read_CpGs_pCD27mcell_noFilter.csv
total_read_CpGs_pCD27pcell_withFilter.csv
total_read_CpGs_pCD27pcell_noFilter.csv
total_read_CpGs_NormalBCD19pcell_withFilter.csv
total_read_CpGs_NormalBCD19pcell_noFilter.csv
total_read_CpGs_CLL_cw154_withFilter.csv
total_read_CpGs_CLL_cw154_noFilter.csv
total_read_CpGs_CLL_RRBS_trito_pool_noFilter.csv

"""









    Out[51]:





'\ntotal_read_CpGs_normal_B_cellA1H1_withFilter.csv\ntotal_read_CpGs_normal_B_cellA1H1_noFilter.csv\ntotal_read_CpGs_pCD27mcell_withFilter.csv\ntotal_read_CpGs_pCD27mcell_noFilter.csv\ntotal_read_CpGs_pCD27pcell_withFilter.csv\ntotal_read_CpGs_pCD27pcell_noFilter.csv\ntotal_read_CpGs_NormalBCD19pcell_withFilter.csv\ntotal_read_CpGs_NormalBCD19pcell_noFilter.csv\ntotal_read_CpGs_CLL_cw154_withFilter.csv\ntotal_read_CpGs_CLL_cw154_noFilter.csv\ntotal_read_CpGs_CLL_RRBS_trito_pool_noFilter.csv\n\n'



In [52]:

    
normal_B_filtered = pd.read_csv("total_read_CpGs_normal_B_cellA1H1_withFilter.csv")
normal_B_unfiltered = pd.read_csv("total_read_CpGs_normal_B_cellA1H1_noFilter.csv")
CD27pcell_filtered = pd.read_csv("total_read_CpGs_pCD27pcell_withFilter.csv")
CD27pcell_unfiltered = pd.read_csv("total_read_CpGs_pCD27pcell_noFilter.csv")
CD27mcell_filtered = pd.read_csv("total_read_CpGs_pCD27mcell_withFilter.csv")
CD27mcell_unfiltered = pd.read_csv("total_read_CpGs_pCD27mcell_noFilter.csv")
NormalBCD19pcell_filtered = pd.read_csv("total_read_CpGs_NormalBCD19pcell_withFilter.csv")
NormalBCD19pcell_unfiltered = pd.read_csv("total_read_CpGs_NormalBCD19pcell_noFilter.csv")
trito_both = pd.read_csv("total_read_CpGs_CLL_RRBS_trito_pool_noFilter.csv")
cll_cw154_filtered = pd.read_csv("total_read_CpGs_CLL_cw154_withFilter.csv")
cll_cw154_unfiltered = pd.read_csv("total_read_CpGs_CLL_cw154_noFilter.csv")



In [53]:

    
normal_B_filtered = normal_B_filtered.drop(["Unnamed: 0"], axis=1)
normal_B_unfiltered = normal_B_unfiltered.drop(["Unnamed: 0"], axis=1)
CD27pcell_filtered = CD27pcell_filtered.drop(["Unnamed: 0"], axis=1)
CD27pcell_unfiltered = CD27pcell_unfiltered.drop(["Unnamed: 0"], axis=1)
CD27mcell_filtered = CD27mcell_filtered.drop(["Unnamed: 0"], axis=1)
CD27mcell_unfiltered = CD27mcell_unfiltered.drop(["Unnamed: 0"], axis=1)
NormalBCD19pcell_filtered = NormalBCD19pcell_filtered.drop(["Unnamed: 0"], axis=1)
NormalBCD19pcell_unfiltered = NormalBCD19pcell_unfiltered.drop(["Unnamed: 0"], axis=1)
trito_both = trito_both.drop(["Unnamed: 0"], axis=1)
cll_cw154_filtered = cll_cw154_filtered.drop(["Unnamed: 0"], axis=1)
cll_cw154_unfiltered = cll_cw154_unfiltered.drop(["Unnamed: 0"], axis=1)



In [54]:

    
print(normal_B_filtered.shape)
print(normal_B_unfiltered.shape)
print(CD27pcell_filtered.shape)
print(CD27pcell_unfiltered.shape)
print(CD27mcell_filtered.shape)
print(CD27mcell_unfiltered.shape)
print(NormalBCD19pcell_filtered.shape)
print(NormalBCD19pcell_unfiltered.shape)
print(trito_both.shape)
print(cll_cw154_filtered.shape)
print(cll_cw154_unfiltered.shape)









    



(113, 3)
(138, 3)
(69, 3)
(91, 3)
(74, 3)
(88, 3)
(83, 3)
(89, 3)
(44, 3)
(60, 3)
(66, 3)



In [55]:

    
filtered_files = [normal_B_filtered, CD27pcell_filtered, CD27mcell_filtered, NormalBCD19pcell_filtered, trito_both, cll_cw154_filtered]



In [56]:

    
filtered_all = pd.concat(filtered_files)



In [57]:

    
unfiltered_files = [normal_B_unfiltered, CD27pcell_unfiltered, CD27mcell_unfiltered, NormalBCD19pcell_unfiltered, trito_both, cll_cw154_unfiltered]



In [58]:

    
unfiltered_all = pd.concat(unfiltered_files)



In [59]:

    
print(filtered_all.shape)
print(unfiltered_all.shape)









    



(443, 3)
(516, 3)



In [60]:

    
unfiltered_all_pl = unfiltered_all.loc[:,['total_reads']]
unfiltered_all_pl = unfiltered_all_pl.reset_index(drop=True)



In [61]:

    
unfiltered_all_pl.plot(kind='area')
plt.title('Unfiltered, total reads, 516 *.anno files')
plt.xlabel('516 total files')  # still a few files with reads less than 10K









    Out[61]:





<matplotlib.text.Text at 0x11603d198>



In [62]:

    
unfiltered_all2 = unfiltered_all.loc[:,['total_unique_CpGs']]
unfiltered_all2 = unfiltered_all2.reset_index(drop=True)



In [63]:

    
unfiltered_all2.plot(kind='area', color='g')
plt.title('Unfiltered, total unique CpG per cells, 443 *.anno files')
plt.xlabel('516 total files')  # still a few files with reads less than 10K









    Out[63]:





<matplotlib.text.Text at 0x11636c470>



In [64]:

    
filtered_all_pl = filtered_all.loc[:,['total_reads']]
filtered_all_pl = filtered_all_pl.reset_index(drop=True)



In [65]:

    
filtered_all_pl.plot(kind='area')
plt.title('Filtered, total reads, 516 *.anno files')
plt.xlabel('443 total files')  # still a few files with reads less than 10K









    Out[65]:





<matplotlib.text.Text at 0x1163a9550>



In [66]:

    
filtered_all_pl2 = filtered_all.loc[:,['total_unique_CpGs']]
filtered_all_pl2 = filtered_all_pl2.reset_index(drop=True)



In [144]:

    
filtered_all_pl2.plot(kind='area', color='g')
plt.title('Filtered, total unique CpG per cells, 443 *.anno files')
plt.xlabel('443 total files')  # still a few files with reads less than 10K









    Out[144]:





<matplotlib.text.Text at 0x11a3670f0>



In [ ]:



In [68]:

    
unfiltered_all3 = unfiltered_all.loc[:,['total_reads', 'total_unique_CpGs']]
unfiltered_all3 = unfiltered_all3.reset_index(drop=True)



In [69]:

    
unfiltered_all3.plot(kind='area')
plt.title('Uniltered, total_reads & total unique CpG per cells, 516 *.anno files')
plt.xlabel('443 total files')  # still a few files with reads less than 10K









    Out[69]:





<matplotlib.text.Text at 0x11669a9e8>



In [70]:

    
filtered_all3 = filtered_all.loc[:,['total_reads', 'total_unique_CpGs']]
filtered_all3 = filtered_all3.reset_index(drop=True)



In [71]:

    
filtered_all3.plot(kind='area')
plt.title('Filtered, total_reads & total unique CpG per cells, 443 *.anno files')
plt.xlabel('443 total files')  # still a few files with reads less than 10K









    Out[71]:





<matplotlib.text.Text at 0x1165f10f0>



In [72]:

    
df1 = pd.read_table('allStats.txt')

df1 = df1.drop('sample.1', axis=1)
df1 = df1.drop('sample.2', axis=1)



In [73]:

    
df1.head()









    Out[73]:






  
    
      
      sample
      class
      totMeth
      totSeen
      avSum
      avTot
      rMixed
      rTot
      rAv
      rAvTot
      ...
      totAligned
      totUsed
      totClipped
      totMethCpG
      totSeenCpG
      totCpG
      totalReadPairs
      totalReads
      alignedReads
      bsRate
    
  
  
    
      0
      RRBS_trito_pool_1_TAAGGCGA.ACAACC
      scCLL
      16200247
      26459744
      589804.381329
      979272
      1707218
      6898212
      63327.391726
      248437
      ...
      7883322
      7883322
      120491
      16374456
      26716690
      995714
      8297596
      16595192
      11666213
      0.980280
    
    
      1
      RRBS_trito_pool_1_TAAGGCGA.ACGTGG
      scCLL
      9323253
      15679703
      411855.757258
      694650
      1008783
      4076392
      44693.975166
      174308
      ...
      4679956
      4679956
      68785
      9443072
      15850878
      705787
      4828405
      9656810
      6822031
      0.980081
    
    
      2
      RRBS_trito_pool_1_TAAGGCGA.ACTCAC
      scCLL
      13936061
      22777935
      511479.409928
      851529
      1417386
      5739945
      54134.747198
      213209
      ...
      6500492
      6500492
      108121
      14067287
      22965785
      865744
      7084390
      14168780
      9869725
      0.980305
    
    
      3
      RRBS_trito_pool_1_TAAGGCGA.AGGATG
      scCLL
      15308780
      24985015
      567608.363999
      939422
      1604313
      6515345
      59514.352762
      235074
      ...
      7455882
      7455882
      112957
      15482976
      25239223
      955160
      11133907
      22267814
      11048117
      0.980392
    
    
      4
      RRBS_trito_pool_1_TAAGGCGA.ATAGCG
      scCLL
      9079622
      15723841
      365196.872199
      624686
      1012941
      3996623
      40262.819815
      155636
      ...
      4548257
      4548257
      69445
      9169815
      15859084
      634455
      4912633
      9825266
      6881418
      0.980256
    
  

5 rows × 23 columns



In [74]:

    
df1 = df1.drop('class', axis=1)
df1 = df1.drop('totMeth', axis=1)
df1 = df1.drop('totSeen', axis=1)
df1 = df1.drop('avSum', axis=1)
df1 = df1.drop('avTot', axis=1)
df1 = df1.drop('rMixed', axis=1)
df1 = df1.drop('rTot', axis=1)
df1 = df1.drop('rAv', axis=1)
df1 = df1.drop('rAvTot', axis=1)
df1 = df1.drop('bed', axis=1)
df1 = df1.drop('methInfoFile', axis=1)
df1 = df1.drop('totReads', axis=1)
df1 = df1.drop('totAligned', axis=1)
df1 = df1.drop('totClipped', axis=1)

df1 = df1.drop('totUsed', axis=1)
df1 = df1.drop('totMethCpG', axis=1)
df1 = df1.drop('totSeenCpG', axis=1)
# df1 = df1.drop('totCpG', axis=1)
df1 = df1.drop('totalReadPairs', axis=1)
df1 = df1.drop('alignedReads', axis=1)
df1 = df1.drop('bsRate', axis=1)



In [75]:

    
df1.head()









    Out[75]:






  
    
      
      sample
      totCpG
      totalReads
    
  
  
    
      0
      RRBS_trito_pool_1_TAAGGCGA.ACAACC
      995714
      16595192
    
    
      1
      RRBS_trito_pool_1_TAAGGCGA.ACGTGG
      705787
      9656810
    
    
      2
      RRBS_trito_pool_1_TAAGGCGA.ACTCAC
      865744
      14168780
    
    
      3
      RRBS_trito_pool_1_TAAGGCGA.AGGATG
      955160
      22267814
    
    
      4
      RRBS_trito_pool_1_TAAGGCGA.ATAGCG
      634455
      9825266



In [76]:

    
df1.shape









    Out[76]:





(536, 3)



In [77]:

    
df1 = df1.rename(columns={'sample': 'filename'})



In [78]:

    
df1.head()









    Out[78]:






  
    
      
      filename
      totCpG
      totalReads
    
  
  
    
      0
      RRBS_trito_pool_1_TAAGGCGA.ACAACC
      995714
      16595192
    
    
      1
      RRBS_trito_pool_1_TAAGGCGA.ACGTGG
      705787
      9656810
    
    
      2
      RRBS_trito_pool_1_TAAGGCGA.ACTCAC
      865744
      14168780
    
    
      3
      RRBS_trito_pool_1_TAAGGCGA.AGGATG
      955160
      22267814
    
    
      4
      RRBS_trito_pool_1_TAAGGCGA.ATAGCG
      634455
      9825266



In [79]:

    
stats_evan_both_unfiltered = unfiltered_all.merge(df1, on="filename")



In [80]:

    
stats_evan_both_unfiltered.shape









    Out[80]:





(516, 5)



In [81]:

    
stats_evan_both_unfiltered_plot = stats_evan_both_unfiltered.drop('filename', axis=1)



In [82]:

    
stats_evan_both_unfiltered_plot = stats_evan_both_unfiltered_plot.rename(columns={'totCpG': 'allStats.txt totCpG', 'totalReads': 'allStats.txt total reads'})



In [83]:

    
stats_evan_both_unfiltered_plot.columns









    Out[83]:





Index(['total_reads', 'total_unique_CpGs', 'allStats.txt totCpG',
       'allStats.txt total reads'],
      dtype='object')



In [84]:

    
stats_evan_both_unfiltered_plot2 = stats_evan_both_unfiltered_plot.loc[:,['total_reads', 'allStats.txt total reads']]
stats_evan_both_unfiltered_plot2 = stats_evan_both_unfiltered_plot2.reset_index(drop=True)



In [85]:

    
stats_evan_both_unfiltered_plot2.plot(kind='area')
plt.title('Unfiltered, allStats.txt versus Broad URL, total reads, 516 *.anno files')
plt.xlabel('516 total files')  # still a few files with reads less than 10K









    Out[85]:





<matplotlib.text.Text at 0x1167a1320>



In [86]:

    
stats_evan_both_unfiltered_plot3 = stats_evan_both_unfiltered_plot.loc[:,['total_unique_CpGs', 'allStats.txt totCpG']]
stats_evan_both_unfiltered_plot3 = stats_evan_both_unfiltered_plot3.reset_index(drop=True)



In [87]:

    
stats_evan_both_unfiltered_plot3.plot(kind='area')
plt.title('Unfiltered, allStats.txt versus Broad URL, total CpG, 516 *.anno files')
plt.xlabel('516 total files')  # still a few files with reads less than 10K









    Out[87]:





<matplotlib.text.Text at 0x1168cbd68>



In [88]:

    
stats_evan_both_filtered = filtered_all.merge(df1, on="filename")



In [89]:

    
stats_evan_both_filtered.shape









    Out[89]:





(443, 5)



In [90]:

    
stats_evan_both_filtered_plot = stats_evan_both_filtered.drop('filename', axis=1)



In [91]:

    
stats_evan_both_filtered_plot = stats_evan_both_filtered_plot.rename(columns={'totCpG': 'allStats.txt totCpG', 'totalReads': 'allStats.txt total reads'})



In [92]:

    
stats_evan_both_filtered_plot.columns









    Out[92]:





Index(['total_reads', 'total_unique_CpGs', 'allStats.txt totCpG',
       'allStats.txt total reads'],
      dtype='object')



In [93]:

    
stats_evan_both_filtered_plot2 = stats_evan_both_filtered_plot.loc[:,['total_reads', 'allStats.txt total reads']]
stats_evan_both_filtered_plot2 = stats_evan_both_filtered_plot2.reset_index(drop=True)



In [94]:

    
stats_evan_both_filtered_plot2.plot(kind='area')
plt.title('Filtered, allStats.txt versus Broad URL, total reads, 443 *.anno files')
plt.xlabel('443 total files')  # still a few files with reads less than 10K









    Out[94]:





<matplotlib.text.Text at 0x116f61f28>



In [ ]:



In [95]:

    
stats_evan_both_filtered_plot3 = stats_evan_both_filtered_plot.loc[:,['total_unique_CpGs', 'allStats.txt totCpG']]
stats_evan_both_filtered_plot3 = stats_evan_both_filtered_plot3.reset_index(drop=True)



In [96]:

    
stats_evan_both_filtered_plot3.plot(kind='area')
plt.title('Filtered, allStats.txt versus Broad URL, totCpG--total unique CpG per cells, 443 *.anno files')
plt.xlabel('443 total files')  # still a few files with reads less than 10K









    Out[96]:





<matplotlib.text.Text at 0x116c27c50>



In [ ]:



In [ ]:

    
#
#
# The above output is y-axis: sum total # unique CpGs count, x-axis is anno file. 
# 
# It appears you want:  y-axis: # of cells with sum total CpG count, x-axis is range of sum total CpG counts per file.
#



In [100]:

    
unfiltered_all2.plot(kind='area', color='g')
plt.title('Filtered, total unique CpG per cells, 443 *.anno files')
plt.xlabel('443 total files')  # still a few files with reads less than 10K









    Out[100]:





<matplotlib.text.Text at 0x117199fd0>



In [101]:

    
unfiltered_all2.columns









    Out[101]:





Index(['total_unique_CpGs'], dtype='object')



In [108]:

    
plt.hist(unfiltered_all2['total_unique_CpGs'], bins=15)









    Out[108]:





(array([  85.,  128.,  115.,   70.,   47.,   40.,   19.,    3.,    3.,
           2.,    3.,    0.,    0.,    0.,    1.]),
 array([  3.71000000e+02,   1.39864733e+05,   2.79358467e+05,
          4.18852200e+05,   5.58345933e+05,   6.97839667e+05,
          8.37333400e+05,   9.76827133e+05,   1.11632087e+06,
          1.25581460e+06,   1.39530833e+06,   1.53480207e+06,
          1.67429580e+06,   1.81378953e+06,   1.95328327e+06,
          2.09277700e+06]),
 <a list of 15 Patch objects>)



In [125]:

    
num_bins = 50
# the histogram of the data
x = unfiltered_all2['total_unique_CpGs']
n, bins, patches = plt.hist(x, num_bins, normed=1, facecolor='green', alpha=0.5)
plt.xlabel('Smarts')
plt.ylabel('Probability')
plt.title(r'Histogram of IQ: $\mu=100$, $\sigma=15$')









    Out[125]:





<matplotlib.text.Text at 0x118ae66a0>



In [168]:

    
num_bins = 75
data = unfiltered_all2['total_unique_CpGs']
plt.hist(data, num_bins, facecolor='green', alpha=0.75)
plt.ylabel('Number of cells')
plt.xlabel('Sum total Unique CpGs, bins=75; total 516 files')
plt.title(r'Unfiltered sum total unique CpGs per *.anno file')









    Out[168]:





<matplotlib.text.Text at 0x11bd368d0>



In [169]:

    
unfiltered_all_pl[:10]









    Out[169]:






  
    
      
      total_reads
    
  
  
    
      0
      11894660.0
    
    
      1
      3744659.0
    
    
      2
      10461874.0
    
    
      3
      14051.0
    
    
      4
      21928743.0
    
    
      5
      10864882.0
    
    
      6
      18977710.0
    
    
      7
      15806813.0
    
    
      8
      18369519.0
    
    
      9
      4828.0



In [170]:

    
num_bins = 75
data = unfiltered_all_pl['total_reads']
plt.hist(data, num_bins, alpha=0.75)
plt.ylabel('Number of cells')
plt.xlabel('total reads, bins=75; total 516 files')
plt.title(r'Unfiltered total reads per *.anno file')









    Out[170]:





<matplotlib.text.Text at 0x11bf774e0>



In [171]:

    
filtered_all_pl[:10]









    Out[171]:






  
    
      
      total_reads
    
  
  
    
      0
      11894660.0
    
    
      1
      10461874.0
    
    
      2
      21928743.0
    
    
      3
      10864882.0
    
    
      4
      18977710.0
    
    
      5
      15806813.0
    
    
      6
      18369519.0
    
    
      7
      11929945.0
    
    
      8
      22563380.0
    
    
      9
      11977642.0



In [172]:

    
num_bins = 75
data = filtered_all_pl['total_reads']
plt.hist(data, num_bins )
plt.ylabel('Number of cells')
plt.xlabel('total reads, bins=75; total 443 files')
plt.title(r'Filtered total reads per *.anno file')









    Out[172]:





<matplotlib.text.Text at 0x11c0dae80>



In [173]:

    
num_bins = 75
data = filtered_all_pl2['total_unique_CpGs']
plt.hist(data, num_bins, facecolor='green' )
plt.ylabel('Number of cells')
plt.xlabel('Sum total Unique CpGs, bins=75; total 443 files')
plt.title(r'Filtered Sum total Unique CpGs per *.anno file')









    Out[173]:





<matplotlib.text.Text at 0x11c30e898>



In [ ]:



In [ ]:



In [ ]:

	sample	class	totMeth	totSeen	avSum	avTot	rMixed	rTot	rAv	rAvTot	...	totAligned	totUsed	totClipped	totMethCpG	totSeenCpG	totCpG	totalReadPairs	totalReads	alignedReads	bsRate
0	RRBS_trito_pool_1_TAAGGCGA.ACAACC	scCLL	16200247	26459744	589804.381329	979272	1707218	6898212	63327.391726	248437	...	7883322	7883322	120491	16374456	26716690	995714	8297596	16595192	11666213	0.980280
1	RRBS_trito_pool_1_TAAGGCGA.ACGTGG	scCLL	9323253	15679703	411855.757258	694650	1008783	4076392	44693.975166	174308	...	4679956	4679956	68785	9443072	15850878	705787	4828405	9656810	6822031	0.980081
2	RRBS_trito_pool_1_TAAGGCGA.ACTCAC	scCLL	13936061	22777935	511479.409928	851529	1417386	5739945	54134.747198	213209	...	6500492	6500492	108121	14067287	22965785	865744	7084390	14168780	9869725	0.980305
3	RRBS_trito_pool_1_TAAGGCGA.AGGATG	scCLL	15308780	24985015	567608.363999	939422	1604313	6515345	59514.352762	235074	...	7455882	7455882	112957	15482976	25239223	955160	11133907	22267814	11048117	0.980392
4	RRBS_trito_pool_1_TAAGGCGA.ATAGCG	scCLL	9079622	15723841	365196.872199	624686	1012941	3996623	40262.819815	155636	...	4548257	4548257	69445	9169815	15859084	634455	4912633	9825266	6881418	0.980256

	total_reads
0	11894660.0
1	3744659.0
2	10461874.0
3	14051.0
4	21928743.0
5	10864882.0
6	18977710.0
7	15806813.0
8	18369519.0
9	4828.0