In [1]:
%matplotlib inline

In [2]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # print all rows


import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')

In [3]:
normal_cellA_df = pd.read_csv("Meth_PDR_cell_RRBS_normal_B1_ALL.csv")    # these are the 'weighted' results
normal_cellB_df = pd.read_csv("Meth_PDR_cell_normalpcell_ALL.csv")
normal_cellC_df = pd.read_csv("Meth_PDR_cell_normalmcell_ALL.csv")
cll_cellA_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_cw154_ALL.csv")
cll_cellC_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_trito_pool_C_ALL.csv")

In [4]:
print(normal_cellA_df.shape)
print(normal_cellB_df.shape)
print(normal_cellC_df.shape)
print(cll_cellA_df.shape)
print(cll_cellC_df.shape)


(126, 7)
(90, 7)
(88, 7)
(66, 7)
(44, 7)

In [5]:
frames1 = [normal_cellA_df, normal_cellB_df, normal_cellC_df]

normal_result = pd.concat(frames1)

In [6]:
normal_result["type"] = str('normal')

In [7]:
normal_result = normal_result.reset_index(drop=True)

In [8]:
cll_cellA_df = cll_cellA_df.drop(["Unnamed: 0"], axis=1)  
cll_cellC_df = cll_cellC_df.drop(["Unnamed: 0"], axis=1)

In [9]:
frames2 = [cll_cellA_df, cll_cellC_df]

cll_result = pd.concat(frames2)

In [10]:
cll_result.shape


Out[10]:
(110, 6)

In [11]:
cll_result["type"] = str('CLL')

In [12]:
cll_result = cll_result.reset_index(drop=True)

In [13]:
print(normal_result.shape)
print(cll_result.shape)


(304, 8)
(110, 7)

In [14]:
combined = normal_result.append(cll_result)

In [15]:
combined.head()


Out[15]:
PDR_total Unnamed: 0 filename methylation mixedReadCount thisMeth total_reads type
0 0.259001 0.0 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACAACC.dan.a... 0.591346 3080732.0 7033858.0 11894660.0 normal
1 0.411448 1.0 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACCGCG.dan.a... 0.531169 1540734.0 1989048.0 3744659.0 normal
2 0.278568 2.0 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACGTGG.dan.a... 0.586403 2914341.0 6134873.0 10461874.0 normal
3 0.384385 3.0 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACTCAC.dan.a... 0.618746 5401.0 8694.0 14051.0 normal
4 0.248006 4.0 RRBS_normal_B_cell_A1_24_TAAGGCGA.AGGATG.dan.a... 0.628623 5438461.0 13784911.0 21928743.0 normal

In [16]:
combined.shape


Out[16]:
(414, 8)

In [17]:
# Remove all data points with less than 100k in totcpg 
combined = combined[combined['total_reads'] > 100000]

In [18]:
combined.shape  # removed 56 files


Out[18]:
(358, 8)

In [19]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined["type"], y=combined["PDR_total"])
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.xlabel("Normal cells = 304 *.anno files; CLL cells = 112 *.anno files")


Out[19]:
<matplotlib.text.Text at 0x107b367f0>

In [20]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined["type"], y=combined["PDR_total"], jitter=True)
sns.plt.title("Total PDR per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")


Out[20]:
<matplotlib.text.Text at 0x107d210b8>

In [21]:
# find out identity of CLL outlier


cll_result = cll_result[cll_result['total_reads'] > 100000]
PDRlessthan30 = cll_result[cll_result["PDR_total"]<0.30]

In [22]:
PDRlessthan30  # Ask Dan about this
# Possibly "you are sampling random chromosomes, please avoid 13, which has a deletion in cw154"


Out[22]:
filename methylation PDR_total thisMeth mixedReadCount total_reads type
18 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GTTG... 0.492474 0.288357 65008.0 38064.0 132003.0 CLL

In [23]:
ax = sns.violinplot(x=combined["type"], y=combined["PDR_total"])
sns.plt.title("Violin plot: Total PDR per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
print("violin plot features a kernel density estimation of the underlying distribution")


violin plot features a kernel density estimation of the underlying distribution

In [24]:
ax = sns.boxplot(x=combined["type"], y=combined["PDR_total"], linewidth=1.5)
plt.ylim(0.0, 0.65)
sns.plt.title("Violin plot: Total PDR per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
print("Box whisker plot")


Box whisker plot

In [25]:
ax = sns.boxplot(y=combined["type"], x=combined["PDR_total"], linewidth=1.5)
sns.plt.title("Horizontal Box-Whisker Plot: Total PDR per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.xlim(0.0, 0.70)
plt.ylabel("Cell Type: Normal cells vs. CLL samples")


Out[25]:
<matplotlib.text.Text at 0x14fcc5240>

In [26]:
ax = sns.boxplot(x=combined["type"], y=combined["PDR_total"], linewidth=1.5)
ax = sns.stripplot(x=combined["type"], y=combined["PDR_total"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Total PDR per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")


Out[26]:
<matplotlib.text.Text at 0x14fca8828>

In [27]:
ax = sns.boxplot(y=combined["type"], x=combined["PDR_total"], linewidth=1.5)
ax = sns.stripplot(y=combined["type"], x=combined["PDR_total"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Total PDR per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.xlabel("PDR")


Out[27]:
<matplotlib.text.Text at 0x14fec69b0>

In [28]:
ax = sns.boxplot(x=combined["type"], y=combined["PDR_total"], linewidth=1.5)
ax = sns.swarmplot(x=combined["type"], y=combined["PDR_total"], color=".25", linewidth=1.5)
sns.plt.title("Total PDR per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
print("Swarmplot == categorical scatterplot where the points do not overlap")


Swarmplot == categorical scatterplot where the points do not overlap

In [29]:
ax = sns.boxplot(y=combined["type"], x=combined["PDR_total"], linewidth=1.5)
ax = sns.swarmplot(y=combined["type"], x=combined["PDR_total"], color=".25", linewidth=1.5)
sns.plt.title("Swarmplot: Total PDR per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")


Out[29]:
<matplotlib.text.Text at 0x14ff5f898>

In [ ]:


In [30]:
normal_cellA_df = pd.read_csv("Meth_PDR_cell_RRBS_normal_B1_ALL.csv")    # these are the 'weighted' results
normal_cellB_df = pd.read_csv("Meth_PDR_cell_normalpcell_ALL.csv")
normal_cellC_df = pd.read_csv("Meth_PDR_cell_normalmcell_ALL.csv")
cll_cellA_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_cw154_ALL.csv")
cll_cellC_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_trito_pool_C_ALL.csv")

In [31]:
normal_cellA_df = normal_cellA_df.drop(["Unnamed: 0"], axis=1)  
normal_cellA_df["type"] = str('normal')
normal_cellA_df["bio"] = str('normal_B')
normal_cellA_df["protocol"] = normal_cellA_df["filename"].str[5:24]
normal_cellB_df = normal_cellB_df.drop(["Unnamed: 0"], axis=1)  
normal_cellB_df["type"] = str('normal')
normal_cellB_df["bio"] = str('CD27p')
normal_cellB_df["protocol"] = normal_cellB_df["filename"].str[5:31]
normal_cellC_df = normal_cellC_df.drop(["Unnamed: 0"], axis=1)  
normal_cellC_df["type"] = str('normal')
normal_cellC_df["bio"] = str('CD27m')
normal_cellC_df["protocol"] = normal_cellC_df["filename"].str[5:31]

In [32]:
frames3 = [normal_cellA_df, normal_cellB_df, normal_cellC_df]
normal_result = pd.concat(frames3)
normal_result = normal_result[['filename', 'PDR_total', 'total_reads', 'type', 'bio', 'protocol']]

In [33]:
cll_cellA_df = cll_cellA_df.drop(["Unnamed: 0"], axis=1) 
cll_cellA_df["type"] = str('CLL')
cll_cellA_df["bio"] = str('CLL')
cll_cellA_df["protocol"] = cll_cellA_df["filename"].str[5:34]
cll_cellA_df["protocol"][cll_cellA_df["protocol"] == 'cw154_CutSmart_proteinase_K_T'] = 'cw154_CutSmart_proteinase_K'
cll_cellA_df["protocol"][cll_cellA_df["protocol"] == 'cw154_Tris_protease_GR_CAGAGA'] = 'cw154_Tris_protease_GR'
cll_cellA_df["protocol"][(cll_cellA_df["protocol"] != 'cw154_Tris_protease_GR') & (cll_cellA_df["protocol"] != 'cw154_CutSmart_proteinase_K')] = 'cw154_Tris_protease'
cll_cellC_df = cll_cellC_df.drop(["Unnamed: 0"], axis=1) 
cll_cellC_df["type"] = str('CLL')
cll_cellC_df["bio"] = str('CLL')
cll_cellC_df["protocol"] = cll_cellC_df["filename"].str[5:17]
frames2 = [cll_cellA_df, cll_cellC_df]
cll_result = pd.concat(frames2)
cll_result = cll_result[['filename', 'PDR_total', 'total_reads', 'type', 'bio', 'protocol']]
cll_result = cll_result.reset_index(drop=True)
normal_result = normal_result.reset_index(drop=True)
combined2 = normal_result.append(cll_result)
combined2 = combined2.reset_index(drop=True)
print(combined2.shape)


(414, 6)
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [34]:
# Remove all data points with less than 100k in totcpg 
combined2 = combined2[combined2['total_reads'] > 100000]

In [35]:
print(combined2.shape)


(358, 6)

In [36]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["PDR_total"], hue=combined2.bio)
sns.plt.title("Biological sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[36]:
<matplotlib.legend.Legend at 0x14fb70ba8>

In [37]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["PDR_total"], hue=combined2.bio, jitter=True)
sns.plt.title("Biological sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[37]:
<matplotlib.legend.Legend at 0x14fda4f60>

In [38]:
ax = sns.boxplot(x=combined2["type"], y=combined2["PDR_total"], linewidth=1.5)
plt.ylim(0,0.7)
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["PDR_total"], hue=combined2.bio, jitter=True, linewidth=1.5)
sns.plt.title("Biological sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[38]:
<matplotlib.legend.Legend at 0x15c122f98>

In [39]:
combined2.boxplot(column = 'PDR_total', by='type', fontsize=13)
plt.ylim(0,0.7)
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["PDR_total"], hue=combined2.bio, jitter=True, linewidth=1.5)
sns.plt.title("Biological sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[39]:
<matplotlib.legend.Legend at 0x16a32ccf8>

In [40]:
combined2.boxplot(column = 'PDR_total', by='type', fontsize=13)


Out[40]:
<matplotlib.axes._subplots.AxesSubplot at 0x15c0f97f0>

In [41]:
ax = sns.violinplot(x=combined2["type"], y=combined2["PDR_total"], palette="Set3")
plt.ylim(0,0.7)
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["PDR_total"], hue=combined2.bio, jitter=True, linewidth=1.5)
sns.plt.title("Biological sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[41]:
<matplotlib.legend.Legend at 0x16c13a400>

In [42]:
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["PDR_total"], hue=combined2.protocol, jitter=True)
sns.plt.title("Technical/protocol sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[42]:
<matplotlib.legend.Legend at 0x17bfb7128>

In [43]:
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["PDR_total"], hue=combined2.protocol, jitter=True, linewidth=0.5)
sns.plt.title("Technical/protocol sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[43]:
<matplotlib.legend.Legend at 0x1a3aedba8>

In [44]:
ax = sns.violinplot(x=combined2["type"], y=combined2["PDR_total"], palette="Set1")
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["PDR_total"], hue=combined2.protocol, jitter=True, linewidth=1.0)
sns.plt.title("Technical sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
plt.xlabel("Normal cells = 251 *.anno files; CLL cells = 107 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[44]:
<matplotlib.legend.Legend at 0x1a3ddfe10>

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: