In [1]:
%matplotlib inline

In [2]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # print all rows


import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')

In [3]:
normal_cellA_df = pd.read_csv("Meth_PDR_cell_RRBS_normal_B1_ALL.csv")    # these are the 'weighted' results
normal_cellB_df = pd.read_csv("Meth_PDR_cell_normalmcell_ALL.csv")
normal_cellC_df = pd.read_csv("Meth_PDR_cell_normalpcell_ALL.csv")
cll_cellA_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_cw154_exceptChr13.csv")
cll_cellC_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_trito_pool_C_ALL.csv")

In [4]:
print(normal_cellA_df.shape)
print(normal_cellB_df.shape)
print(normal_cellC_df.shape)
print(cll_cellA_df.shape)
print(cll_cellC_df.shape)


(126, 7)
(88, 7)
(90, 7)
(66, 7)
(44, 7)

In [5]:
frames1 = [normal_cellA_df, normal_cellB_df, normal_cellC_df]
meth_result = pd.concat(frames1)
meth_result.shape


Out[5]:
(304, 7)

In [6]:
meth_result["type"] = str('normal')
meth_result = meth_result.reset_index(drop=True)
normal_result = meth_result

In [7]:
frames2 = [cll_cellA_df, cll_cellC_df]

cll_result = pd.concat(frames2)

In [8]:
cll_cellA_df = cll_cellA_df.drop(["Unnamed: 0"], axis=1)
cll_cellC_df = cll_cellC_df.drop(["Unnamed: 0"], axis=1)
print(cll_cellA_df.shape)
print(cll_cellC_df.shape)
print(cll_result.shape)
cll_result["type"] = str('CLL')
cll_result = cll_result.reset_index(drop=True)


(66, 6)
(44, 6)
(110, 7)

In [9]:
print(normal_result.shape)
print(cll_result.shape)


(304, 8)
(110, 8)

In [10]:
combined = normal_result.append(cll_result)

In [11]:
combined = combined.reset_index(drop=True)

In [12]:
combined.head()


Out[12]:
Unnamed: 0 filename methylation PDR_total thisMeth mixedReadCount total_reads type
0 0 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACAACC.dan.a... 0.591346 0.259001 7033858.0 3080732.0 11894660.0 normal
1 1 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACCGCG.dan.a... 0.531169 0.411448 1989048.0 1540734.0 3744659.0 normal
2 2 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACGTGG.dan.a... 0.586403 0.278568 6134873.0 2914341.0 10461874.0 normal
3 3 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACTCAC.dan.a... 0.618746 0.384385 8694.0 5401.0 14051.0 normal
4 4 RRBS_normal_B_cell_A1_24_TAAGGCGA.AGGATG.dan.a... 0.628623 0.248006 13784911.0 5438461.0 21928743.0 normal

In [13]:
combined.shape # (414, 8)


Out[13]:
(414, 8)

In [14]:
# Remove all data points with less than 100k in totcpg 
combined = combined[combined['total_reads'] > 100000]

In [15]:
combined.shape  # (358, 8), removed 56 files; filtered 304 normal B to 251 files, 112 CCL to 107 files


Out[15]:
(358, 8)

In [16]:
normal_result = normal_result[normal_result['total_reads'] > 100000]
cll_result = cll_result[cll_result['total_reads'] > 100000]
print(normal_result.shape)
print(cll_result.shape)


(251, 8)
(107, 8)

In [17]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined["type"], y=combined["methylation"])
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.ylabel("Percent Methylation, weighted")


Out[17]:
<matplotlib.text.Text at 0x109898550>

In [18]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined["type"], y=combined["methylation"], jitter=True)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.ylabel("Percent Methylation, weighted")


Out[18]:
<matplotlib.text.Text at 0x1099f6978>

In [19]:
ax = sns.violinplot(x=combined["type"],  y=combined["methylation"])
sns.plt.title("Violin plot: Methylation per cell, normal vs CLL")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.ylabel("Percent Methylation, weighted")
print("violin plot features a kernel density estimation of the underlying distribution")
plt.ylim(0.1, 0.85)


violin plot features a kernel density estimation of the underlying distribution
Out[19]:
(0.1, 0.85)

In [20]:
ax = sns.boxplot(x=combined["type"],y=combined["methylation"], linewidth=1.5)
plt.ylim(0.0, 0.85)
sns.plt.title("Violin plot: Methylation per cell, normal vs CLL, chromosomes 2, 5, & 11, corrected")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.ylabel("Percent Methylation, weighted")
print("Box whisker plot")
plt.ylim(0.1, 0.8)


Box whisker plot
Out[20]:
(0.1, 0.8)

In [21]:
ax = sns.boxplot(y=combined["type"], x=combined["methylation"], linewidth=1.5)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.xlim(0.0, 0.80)
plt.ylabel("Cell Type: Normal cells vs. CLL samples")


Out[21]:
<matplotlib.text.Text at 0x109a46c50>

In [ ]:
ax = sns.boxplot(x=combined["type"], y=combined["methylation"], linewidth=1.5)
ax = sns.stripplot(x=combined["type"], y=combined["methylation"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")


Out[ ]:
<matplotlib.text.Text at 0x150f995f8>

In [ ]:
ax = sns.boxplot(y=combined["type"],  x=combined["methylation"], linewidth=1.5)
ax = sns.stripplot(y=combined["type"], x=combined["methylation"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.ylabel("Percent Methylation, weighted")

In [ ]:
ax = sns.boxplot(x=combined["type"], y=combined["methylation"], linewidth=1.5)
ax = sns.swarmplot(x=combined["type"], y=combined["methylation"], color=".25", linewidth=1.5)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.ylabel("Percent Methylation, weighted")

print("Swarmplot == categorical scatterplot where the points do not overlap")

In [ ]:
ax = sns.boxplot(y=combined["type"], x=combined["methylation"], linewidth=1.5)
ax = sns.swarmplot(y=combined["type"], x=combined["methylation"], color=".25", linewidth=1.5)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.ylabel("Percent Methylation, weighted")

In [ ]:


In [ ]:
normal_cellA_df = pd.read_csv("Meth_PDR_cell_RRBS_normal_B1_ALL.csv")    
normal_cellB_df = pd.read_csv("Meth_PDR_cell_normalmcell_ALL.csv")
normal_cellC_df = pd.read_csv("Meth_PDR_cell_normalpcell_ALL.csv")
cll_cellA_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_cw154_ALL.csv")
cll_cellC_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_trito_pool_C_ALL.csv")

In [ ]:
normal_cellA_df = normal_cellA_df.drop(["Unnamed: 0"], axis=1)  
normal_cellA_df["type"] = str('normal')
normal_cellA_df["bio"] = str('normal_B')
normal_cellA_df["protocol"] = normal_cellA_df["filename"].str[5:24]
normal_cellB_df["type"] = str('normal')
normal_cellB_df = normal_cellB_df.drop(["Unnamed: 0"], axis=1)
normal_cellB_df["type"] = str('normal')
normal_cellB_df["bio"] = str('CD27m')
normal_cellB_df["protocol"] = normal_cellB_df["filename"].str[5:31]
normal_cellC_df = normal_cellC_df.drop(["Unnamed: 0"], axis=1)  
normal_cellC_df["type"] = str('normal')
normal_cellC_df["bio"] = str('CD27p')
normal_cellC_df["protocol"] = normal_cellC_df["filename"].str[5:31]

In [ ]:
frames4 = [normal_cellA_df, normal_cellB_df, normal_cellC_df]
normal_result = pd.concat(frames4)

In [ ]:
normal_result = normal_result[['filename', 'methylation', 'thisMeth', 'mixedReadCount', 'total_reads', 'type', 'bio', 'protocol']]

In [ ]:
normal_result.shape

In [ ]:
cll_cellA_df = cll_cellA_df.drop(["Unnamed: 0"], axis=1) 
cll_cellA_df["type"] = str('CLL')
cll_cellA_df["bio"] = str('CLL')
cll_cellA_df["protocol"] = cll_cellA_df["filename"].str[5:34]
cll_cellA_df["protocol"][cll_cellA_df["protocol"] == 'cw154_CutSmart_proteinase_K_T'] = 'cw154_CutSmart_proteinase_K'
cll_cellA_df["protocol"][cll_cellA_df["protocol"] == 'cw154_Tris_protease_GR_CAGAGA'] = 'cw154_Tris_protease_GR'
cll_cellA_df["protocol"][(cll_cellA_df["protocol"] 
                          != 'cw154_Tris_protease_GR') & (cll_cellA_df["protocol"] != 'cw154_CutSmart_proteinase_K')] = 'cw154_Tris_protease'

In [ ]:
cll_cellC_df = cll_cellC_df.drop(["Unnamed: 0"], axis=1) 
cll_cellC_df["type"] = str('CLL')
cll_cellC_df["bio"] = str('CLL')
cll_cellC_df["protocol"] = cll_cellC_df["filename"].str[5:17]

In [ ]:
frames2 = [cll_cellA_df, cll_cellC_df]
cll_result = pd.concat(frames2)
cll_result.shape

In [ ]:
cll_result = cll_result[['filename', 'methylation', 'thisMeth',  'mixedReadCount', 'total_reads', 'type', 'bio', 'protocol']]

In [ ]:
cll_result = cll_result.reset_index(drop=True)
normal_result = normal_result.reset_index(drop=True)
combined2 = normal_result.append(cll_result)
combined2 = combined2.reset_index(drop=True)

In [ ]:
# Remove all data points with less than 100k in totcpg 
combined2 = combined2[combined2['total_reads'] > 100000]

In [ ]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.bio)
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
plt.ylabel("Percent Methylation, weighted")

In [ ]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.bio, jitter=True)
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.ylabel("Percent Methylation, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)

In [ ]:
ax = sns.boxplot(x=combined2["type"],y=combined2["methylation"], linewidth=1.5)
plt.ylim(0.15,0.8)
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.bio, jitter=True, linewidth=1.5)
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.ylabel("Percent Methylation, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)

In [ ]:
combined2.boxplot(column = 'methylation', by='type', fontsize=13)
plt.ylim(0.15,0.8)
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.bio, jitter=True, linewidth=1.5)
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.ylabel("Percent Methylation, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)

In [ ]:
combined2.boxplot(column = 'methylation', by='type', fontsize=13)
plt.title('corrected')

In [ ]:
ax = sns.violinplot(x=combined2["type"],y=combined2["methylation"], palette="Set3")
plt.ylim(0.1,0.85)
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.bio, jitter=True, linewidth=1.5)
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.ylabel("Percent Methylation, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)

In [ ]:
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.protocol, jitter=True)
sns.plt.title("Technical/protocol sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.ylabel("Percent Methylation, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)

In [ ]:
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"],y=combined2["methylation"], hue=combined2.protocol, jitter=True, linewidth=1.0)
sns.plt.title("Technical/protocol sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.ylabel("Percent Methylation, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)

In [ ]:
ax = sns.violinplot(x=combined2["type"], y=combined2["methylation"], palette="Set1")
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.protocol, jitter=True, linewidth=1.0)
sns.plt.title("Technical/protocol sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.ylabel("Percent Methylation, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)

In [ ]:


In [ ]:


In [ ]:


In [ ]: