notebook.community

Edit and run



In [1]:

    
%matplotlib inline



In [2]:

    
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # print all rows


import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')



In [3]:

    
normal_cellA_df = pd.read_csv("Meth_PDR_cell_RRBS_normal_B1_ALL.csv")    # these are the 'weighted' results
normal_cellB_df = pd.read_csv("Meth_PDR_cell_normalpcell_ALL.csv")
normal_cellC_df = pd.read_csv("Meth_PDR_cell_normalmcell_ALL.csv")
cll_cellA_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_cw154_ALL.csv")
cll_cellC_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_trito_pool_C_ALL.csv")



In [4]:

    
print(normal_cellA_df.shape)
print(normal_cellB_df.shape)
print(normal_cellC_df.shape)
print(cll_cellA_df.shape)
print(cll_cellC_df.shape)









    



(126, 7)
(90, 7)
(88, 7)
(66, 7)
(44, 7)



In [5]:

    
frames1 = [normal_cellA_df, normal_cellB_df, normal_cellC_df]

normal_result = pd.concat(frames1)



In [6]:

    
normal_result["type"] = str('normal')



In [7]:

    
normal_result = normal_result.reset_index(drop=True)



In [8]:

    
cll_cellA_df = cll_cellA_df.drop(["Unnamed: 0"], axis=1)  
cll_cellC_df = cll_cellC_df.drop(["Unnamed: 0"], axis=1)



In [9]:

    
frames2 = [cll_cellA_df, cll_cellC_df]

cll_result = pd.concat(frames2)



In [10]:

    
cll_result.shape









    Out[10]:





(110, 6)



In [11]:

    
cll_result["type"] = str('CLL')



In [12]:

    
cll_result = cll_result.reset_index(drop=True)



In [13]:

    
print(normal_result.shape)
print(cll_result.shape)









    



(304, 8)
(110, 7)



In [14]:

    
combined = normal_result.append(cll_result)



In [15]:

    
combined.head()









    Out[15]:






  
    
      
      PDR_total
      Unnamed: 0
      filename
      methylation
      mixedReadCount
      thisMeth
      total_reads
      type
    
  
  
    
      0
      0.259001
      0.0
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACAACC.dan.a...
      0.591346
      3080732.0
      7033858.0
      11894660.0
      normal
    
    
      1
      0.411448
      1.0
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACCGCG.dan.a...
      0.531169
      1540734.0
      1989048.0
      3744659.0
      normal
    
    
      2
      0.278568
      2.0
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACGTGG.dan.a...
      0.586403
      2914341.0
      6134873.0
      10461874.0
      normal
    
    
      3
      0.384385
      3.0
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACTCAC.dan.a...
      0.618746
      5401.0
      8694.0
      14051.0
      normal
    
    
      4
      0.248006
      4.0
      RRBS_normal_B_cell_A1_24_TAAGGCGA.AGGATG.dan.a...
      0.628623
      5438461.0
      13784911.0
      21928743.0
      normal



In [16]:

    
combined.shape









    Out[16]:





(414, 8)



In [17]:

    
# Remove all data points with less than 100k in totcpg 
combined = combined[combined['total_reads'] > 100000]



In [18]:

    
combined.shape  # removed 56 files









    Out[18]:





(358, 8)



In [19]:

    
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined["type"], y=combined["PDR_total"])
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.xlabel("Normal cells = 304 *.anno files; CLL cells = 112 *.anno files")









    Out[19]:





<matplotlib.text.Text at 0x107b367f0>



In [20]:

    
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined["type"], y=combined["PDR_total"], jitter=True)
sns.plt.title("Total PDR per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")









    Out[20]:





<matplotlib.text.Text at 0x107d210b8>



In [21]:

    
# find out identity of CLL outlier


cll_result = cll_result[cll_result['total_reads'] > 100000]
PDRlessthan30 = cll_result[cll_result["PDR_total"]<0.30]



In [22]:

    
PDRlessthan30  # Ask Dan about this
# Possibly "you are sampling random chromosomes, please avoid 13, which has a deletion in cw154"









    Out[22]:






  
    
      
      filename
      methylation
      PDR_total
      thisMeth
      mixedReadCount
      total_reads
      type
    
  
  
    
      18
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GTTG...
      0.492474
      0.288357
      65008.0
      38064.0
      132003.0
      CLL



In [23]:

    
ax = sns.violinplot(x=combined["type"], y=combined["PDR_total"])
sns.plt.title("Violin plot: Total PDR per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
print("violin plot features a kernel density estimation of the underlying distribution")









    



violin plot features a kernel density estimation of the underlying distribution



In [24]:

    
ax = sns.boxplot(x=combined["type"], y=combined["PDR_total"], linewidth=1.5)
plt.ylim(0.0, 0.65)
sns.plt.title("Violin plot: Total PDR per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
print("Box whisker plot")









    



Box whisker plot



In [25]:

    
ax = sns.boxplot(y=combined["type"], x=combined["PDR_total"], linewidth=1.5)
sns.plt.title("Horizontal Box-Whisker Plot: Total PDR per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.xlim(0.0, 0.70)
plt.ylabel("Cell Type: Normal cells vs. CLL samples")









    Out[25]:





<matplotlib.text.Text at 0x14fcc5240>



In [26]:

    
ax = sns.boxplot(x=combined["type"], y=combined["PDR_total"], linewidth=1.5)
ax = sns.stripplot(x=combined["type"], y=combined["PDR_total"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Total PDR per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")









    Out[26]:





<matplotlib.text.Text at 0x14fca8828>



In [27]:

    
ax = sns.boxplot(y=combined["type"], x=combined["PDR_total"], linewidth=1.5)
ax = sns.stripplot(y=combined["type"], x=combined["PDR_total"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Total PDR per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.xlabel("PDR")









    Out[27]:





<matplotlib.text.Text at 0x14fec69b0>



In [28]:

    
ax = sns.boxplot(x=combined["type"], y=combined["PDR_total"], linewidth=1.5)
ax = sns.swarmplot(x=combined["type"], y=combined["PDR_total"], color=".25", linewidth=1.5)
sns.plt.title("Total PDR per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
print("Swarmplot == categorical scatterplot where the points do not overlap")









    



Swarmplot == categorical scatterplot where the points do not overlap



In [29]:

    
ax = sns.boxplot(y=combined["type"], x=combined["PDR_total"], linewidth=1.5)
ax = sns.swarmplot(y=combined["type"], x=combined["PDR_total"], color=".25", linewidth=1.5)
sns.plt.title("Swarmplot: Total PDR per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")









    Out[29]:





<matplotlib.text.Text at 0x14ff5f898>



In [ ]:



In [30]:

    
normal_cellA_df = pd.read_csv("Meth_PDR_cell_RRBS_normal_B1_ALL.csv")    # these are the 'weighted' results
normal_cellB_df = pd.read_csv("Meth_PDR_cell_normalpcell_ALL.csv")
normal_cellC_df = pd.read_csv("Meth_PDR_cell_normalmcell_ALL.csv")
cll_cellA_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_cw154_ALL.csv")
cll_cellC_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_trito_pool_C_ALL.csv")



In [31]:

    
normal_cellA_df = normal_cellA_df.drop(["Unnamed: 0"], axis=1)  
normal_cellA_df["type"] = str('normal')
normal_cellA_df["bio"] = str('normal_B')
normal_cellA_df["protocol"] = normal_cellA_df["filename"].str[5:24]
normal_cellB_df = normal_cellB_df.drop(["Unnamed: 0"], axis=1)  
normal_cellB_df["type"] = str('normal')
normal_cellB_df["bio"] = str('CD27p')
normal_cellB_df["protocol"] = normal_cellB_df["filename"].str[5:31]
normal_cellC_df = normal_cellC_df.drop(["Unnamed: 0"], axis=1)  
normal_cellC_df["type"] = str('normal')
normal_cellC_df["bio"] = str('CD27m')
normal_cellC_df["protocol"] = normal_cellC_df["filename"].str[5:31]



In [32]:

    
frames3 = [normal_cellA_df, normal_cellB_df, normal_cellC_df]
normal_result = pd.concat(frames3)
normal_result = normal_result[['filename', 'PDR_total', 'total_reads', 'type', 'bio', 'protocol']]



In [33]:

    
cll_cellA_df = cll_cellA_df.drop(["Unnamed: 0"], axis=1) 
cll_cellA_df["type"] = str('CLL')
cll_cellA_df["bio"] = str('CLL')
cll_cellA_df["protocol"] = cll_cellA_df["filename"].str[5:34]
cll_cellA_df["protocol"][cll_cellA_df["protocol"] == 'cw154_CutSmart_proteinase_K_T'] = 'cw154_CutSmart_proteinase_K'
cll_cellA_df["protocol"][cll_cellA_df["protocol"] == 'cw154_Tris_protease_GR_CAGAGA'] = 'cw154_Tris_protease_GR'
cll_cellA_df["protocol"][(cll_cellA_df["protocol"] != 'cw154_Tris_protease_GR') & (cll_cellA_df["protocol"] != 'cw154_CutSmart_proteinase_K')] = 'cw154_Tris_protease'
cll_cellC_df = cll_cellC_df.drop(["Unnamed: 0"], axis=1) 
cll_cellC_df["type"] = str('CLL')
cll_cellC_df["bio"] = str('CLL')
cll_cellC_df["protocol"] = cll_cellC_df["filename"].str[5:17]
frames2 = [cll_cellA_df, cll_cellC_df]
cll_result = pd.concat(frames2)
cll_result = cll_result[['filename', 'PDR_total', 'total_reads', 'type', 'bio', 'protocol']]
cll_result = cll_result.reset_index(drop=True)
normal_result = normal_result.reset_index(drop=True)
combined2 = normal_result.append(cll_result)
combined2 = combined2.reset_index(drop=True)
print(combined2.shape)









    



(414, 6)






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [34]:

    
# Remove all data points with less than 100k in totcpg 
combined2 = combined2[combined2['total_reads'] > 100000]



In [35]:

    
print(combined2.shape)



In [36]:

    
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["PDR_total"], hue=combined2.bio)
sns.plt.title("Biological sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)









    Out[36]:





<matplotlib.legend.Legend at 0x14fb70ba8>



In [37]:

    
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["PDR_total"], hue=combined2.bio, jitter=True)
sns.plt.title("Biological sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)









    Out[37]:





<matplotlib.legend.Legend at 0x14fda4f60>



In [38]:

    
ax = sns.boxplot(x=combined2["type"], y=combined2["PDR_total"], linewidth=1.5)
plt.ylim(0,0.7)
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["PDR_total"], hue=combined2.bio, jitter=True, linewidth=1.5)
sns.plt.title("Biological sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)









    Out[38]:





<matplotlib.legend.Legend at 0x15c122f98>



In [39]:

    
combined2.boxplot(column = 'PDR_total', by='type', fontsize=13)
plt.ylim(0,0.7)
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["PDR_total"], hue=combined2.bio, jitter=True, linewidth=1.5)
sns.plt.title("Biological sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)









    Out[39]:





<matplotlib.legend.Legend at 0x16a32ccf8>



In [40]:

    
combined2.boxplot(column = 'PDR_total', by='type', fontsize=13)









    Out[40]:





<matplotlib.axes._subplots.AxesSubplot at 0x15c0f97f0>



In [41]:

    
ax = sns.violinplot(x=combined2["type"], y=combined2["PDR_total"], palette="Set3")
plt.ylim(0,0.7)
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["PDR_total"], hue=combined2.bio, jitter=True, linewidth=1.5)
sns.plt.title("Biological sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)









    Out[41]:





<matplotlib.legend.Legend at 0x16c13a400>



In [42]:

    
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["PDR_total"], hue=combined2.protocol, jitter=True)
sns.plt.title("Technical/protocol sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)









    Out[42]:





<matplotlib.legend.Legend at 0x17bfb7128>



In [43]:

    
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["PDR_total"], hue=combined2.protocol, jitter=True, linewidth=0.5)
sns.plt.title("Technical/protocol sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)









    Out[43]:





<matplotlib.legend.Legend at 0x1a3aedba8>



In [44]:

    
ax = sns.violinplot(x=combined2["type"], y=combined2["PDR_total"], palette="Set1")
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["PDR_total"], hue=combined2.protocol, jitter=True, linewidth=1.0)
sns.plt.title("Technical sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)









    Out[44]:





<matplotlib.legend.Legend at 0x1a3ddfe10>



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:

	PDR_total	Unnamed: 0	filename	methylation	mixedReadCount	thisMeth	total_reads	type
0	0.259001	0.0	RRBS_normal_B_cell_A1_24_TAAGGCGA.ACAACC.dan.a...	0.591346	3080732.0	7033858.0	11894660.0	normal
1	0.411448	1.0	RRBS_normal_B_cell_A1_24_TAAGGCGA.ACCGCG.dan.a...	0.531169	1540734.0	1989048.0	3744659.0	normal
2	0.278568	2.0	RRBS_normal_B_cell_A1_24_TAAGGCGA.ACGTGG.dan.a...	0.586403	2914341.0	6134873.0	10461874.0	normal
3	0.384385	3.0	RRBS_normal_B_cell_A1_24_TAAGGCGA.ACTCAC.dan.a...	0.618746	5401.0	8694.0	14051.0	normal
4	0.248006	4.0	RRBS_normal_B_cell_A1_24_TAAGGCGA.AGGATG.dan.a...	0.628623	5438461.0	13784911.0	21928743.0	normal