In [1]:
%matplotlib inline
In [2]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # show up to 50 columns when displaying a DataFrame
import os
# NOTE(review): hard-coded absolute path; the relative CSV names read in later cells resolve against it.
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')
import statsmodels.api as sm
In [3]:
"""
total_normal_pairs_RRBS_NormalBCD19pcell45_66.csv
total_normal_pairs_RRBS_NormalBCD19pcell67_88.csv
total_normal_pairs_RRBS_NormalBCD19pcell23_44.csv
total_normal_pairs_RRBS_NormalBCD19pcell1_22.csv
total_normal_pairs_NormalBCD19pCD27mcell45_66.csv
total_normal_pairs_NormalBCD19pCD27mcell23_44.csv
total_normal_pairs_NormalBCD19pCD27mcell1_22.csv
total_normal_pairs_Normal_B_cell_A1_24.csv
total_normal_pairs_NormalBCD19pCD27mcell67_88.csv
total_normal_pairs_Normal_B_cell_H1_22.csv
total_normal_pairs_Normal_B_cell_G1_22.csv
total_normal_pairs_Normal_B_cell_D1_24.csv
total_normal_pairs_Normal_B_cell_C1_24.csv
total_normal_pairs_Normal_B_cell_B1_24.csv
total_CLL_pairs.csv
total_normal_pairs.csv
total_CLL_pairs_cw154_CutSmart_proteinase_K.csv
total_CLL_pairs_cw154_Tris_protease_GR.csv
total_CLL_pairs_cw154_Tris_protease.csv
total_CLL_pairs_trito_pool_2.csv
total_CLL_pairs_trito_pool_1.csv
"""
Out[3]:
In [4]:
# Load the pooled pair tables for the CLL and the normal samples.
cll_pairs = pd.read_csv(filepath_or_buffer="total_CLL_pairs.csv")
normal_pairs = pd.read_csv(filepath_or_buffer="total_normal_pairs.csv")
In [5]:
# (rows, columns) of the CLL pairs table.
cll_pairs.shape
Out[5]:
In [6]:
# (rows, columns) of the normal pairs table.
normal_pairs.shape
Out[6]:
In [7]:
# Column labels of the CLL pairs table.
cll_pairs.columns
Out[7]:
In [8]:
# Preview the first rows of the CLL pairs table.
cll_pairs.head()
Out[8]:
In [9]:
# (rows, columns) of the CLL pairs table (same check as an earlier cell).
cll_pairs.shape
Out[9]:
In [10]:
# Column labels of the CLL pairs table (same check as an earlier cell).
cll_pairs.columns
Out[10]:
In [11]:
# Methylation pair-difference columns of the CLL pairs, one column per line.
meth_cll = cll_pairs[[
    'methylation_unweighted_difference',
    'methylation_tssDistance_difference',
    'methylation_genesDistance_difference',
    'methylation_exonsDistance_difference',
    'methylation_intronsDistance_difference',
    'methylation_promoterDistance_difference',
    'methylation_cgiDistance_difference',
    'methylation_ctcfDistance_difference',
    'methylation_ctcfUpDistance_difference',
    'methylation_ctcfDownDistance_difference',
    'methylation_geneDistalRegulatoryModulesDistance',
    'methylation_vistaEnhancersDistance_difference',
    'methylation_3PrimeUTRDistance_difference',
    'methylation_5PrimeUTRDistance_difference',
    'methylation_firstExonDistance_difference',
    'methylation_geneDistalRegulatoryModulesK562Distance_difference',
    'methylation_hypoInHues64Distance_difference',
    'methylation_intergenic_difference',
    'methylation_shore_difference',
    'methylation_shelf_difference',
]]
In [12]:
# Pass the wide-form table through `data=`: a positional DataFrame binds to `x` on
# seaborn < 0.12 and to `data` on >= 0.12, so the bare call is version-dependent.
sns.boxplot(data=meth_cll)
Out[12]:
In [13]:
# PDR pair-difference columns of the CLL pairs, one column per line.
pdr_cll = cll_pairs[[
    'PDR_genesDistance_difference',
    'PDR_exonsDistance_difference',
    'PDR_intronsDistance_difference',
    'PDR_promoterDistance_difference',
    'PDR_cgiDistance_difference',
    'PDR_ctcfDistance_difference',
    'PDR_ctcfDownDistance_difference',
    'PDR_geneDistalRegulatoryModulesDistance',
    'PDR_vistaEnhancersDistance_difference',
    'PDR_3PrimeUTRDistance_difference',
    'PDR_5PrimeUTRDistance_difference',
    'PDR_firstExonDistance_difference',
    'PDR_geneDistalRegulatoryModulesK562Distance_difference',
    'PDR_hypoInHues64Distance_difference',
    'PDR_intergenic_difference',
    'PDR_shore_difference',
    'PDR_shelf_difference',
]]
In [14]:
# PDR pair-difference columns of the normal pairs, one column per line.
pdr_normal = normal_pairs[[
    'PDR_genesDistance_difference',
    'PDR_exonsDistance_difference',
    'PDR_intronsDistance_difference',
    'PDR_promoterDistance_difference',
    'PDR_cgiDistance_difference',
    'PDR_ctcfDistance_difference',
    'PDR_ctcfDownDistance_difference',
    'PDR_geneDistalRegulatoryModulesDistance',
    'PDR_vistaEnhancersDistance_difference',
    'PDR_3PrimeUTRDistance_difference',
    'PDR_5PrimeUTRDistance_difference',
    'PDR_firstExonDistance_difference',
    'PDR_geneDistalRegulatoryModulesK562Distance_difference',
    'PDR_hypoInHues64Distance_difference',
    'PDR_intergenic_difference',
    'PDR_shore_difference',
    'PDR_shelf_difference',
]]
In [15]:
# (rows, columns) of the CLL PDR subset.
pdr_cll.shape
Out[15]:
In [16]:
# (rows, columns) of the normal PDR subset.
pdr_normal.shape
Out[16]:
In [17]:
# Stack the CLL and normal PDR subsets row-wise. ignore_index=True gives the result
# a fresh 0..n-1 index; without it the row labels of both inputs are repeated,
# which makes label-based lookups on pdr_all ambiguous.
pdr_all = pd.concat([pdr_cll, pdr_normal], ignore_index=True)
In [18]:
# (rows, columns) of the combined CLL + normal PDR table.
pdr_all.shape
Out[18]:
In [19]:
# Pass the wide-form table through `data=`: a positional DataFrame binds to `x` on
# seaborn < 0.12 and to `data` on >= 0.12, so the bare call is version-dependent.
sns.boxplot(data=pdr_all)
Out[19]:
In [20]:
# Preview the first rows of the CLL PDR subset.
pdr_cll.head()
Out[20]:
In [21]:
# Horizontal boxes for the CLL PDR columns. The table goes through `data=` because
# a positional DataFrame binds to `x` on seaborn < 0.12 and to `data` on >= 0.12.
sns.boxplot(data=pdr_cll, orient="h")
plt.xlim(-0.01, 0.55)
Out[21]:
In [22]:
# Horizontal boxes for the normal PDR columns. The table goes through `data=` because
# a positional DataFrame binds to `x` on seaborn < 0.12 and to `data` on >= 0.12.
sns.boxplot(data=pdr_normal, orient="h")
Out[22]:
In [23]:
# Same normal PDR plot, with the x range clipped to the limits used for the CLL plot.
# `data=` is explicit because a positional DataFrame binds to `x` on seaborn < 0.12
# and to `data` on >= 0.12.
sns.boxplot(data=pdr_normal, orient="h")
plt.xlim(-0.01, 0.55)
Out[23]:
In [24]:
# Column labels of the combined PDR table.
pdr_all.columns
Out[24]:
In [25]:
# Preview the first rows of the combined PDR table.
pdr_all.head()
Out[25]:
In [26]:
# Preview the first rows of the CLL PDR subset (same check as an earlier cell).
pdr_cll.head()
Out[26]:
In [27]:
# Column labels of the CLL PDR subset; these become the 'genome_region' values after melting.
pdr_cll.columns
Out[27]:
In [28]:
# Reshape the CLL PDR subset to long form: one (genome_region, PDR_difference) row per cell.
pdr_strip = pdr_cll.melt(var_name='genome_region', value_name='PDR_difference')
In [29]:
# Display the long-form CLL PDR table.
pdr_strip
Out[29]:
In [30]:
# Vertical boxes: one box of CLL PDR differences per genome region.
sns.boxplot(data=pdr_strip, x="genome_region", y="PDR_difference")
Out[30]:
In [ ]:
In [31]:
# Horizontal boxes: regions on the y axis, CLL PDR differences on the x axis.
ax = sns.boxplot(data=pdr_strip, x="PDR_difference", y="genome_region")
ax.set_title("PDR pair difference, CLL")
Out[31]:
In [32]:
# Same CLL PDR plot with outlier markers hidden and a fixed x range.
ax = sns.boxplot(
    data=pdr_strip,
    x="PDR_difference",
    y="genome_region",
    showfliers=False,
)
ax.set_title("PDR pair difference, CLL")
ax.set_xlim(-0.01, 0.75)
Out[32]:
In [33]:
# Reshape the normal PDR subset to long form: one (genome_region, PDR_difference) row per cell.
normalpdr_strip = pdr_normal.melt(var_name='genome_region', value_name='PDR_difference')
In [34]:
# Normal PDR differences per genome region, outlier markers hidden.
ax = sns.boxplot(
    data=normalpdr_strip,
    x="PDR_difference",
    y="genome_region",
    showfliers=False,
)
ax.set_title("PDR pair difference, Normal B")
Out[34]:
In [35]:
# Label each long-form PDR table with its sample group before the two are stacked.
pdr_strip["category"] = "CLL"
normalpdr_strip["category"] = "normal"
In [36]:
# Stack the CLL and normal long-form PDR tables. ignore_index=True gives the result
# a fresh 0..n-1 index; without it the row labels of both melted tables are
# repeated, which makes label-based lookups on total_pdr_pairs ambiguous.
total_pdr_pairs = pd.concat([pdr_strip, normalpdr_strip], ignore_index=True)
In [37]:
# Preview the first rows of the stacked CLL + normal PDR table.
total_pdr_pairs.head()
Out[37]:
In [38]:
# PDR differences per genome region, split by sample group via `hue`.
sns.boxplot(
    data=total_pdr_pairs,
    x="PDR_difference",
    y="genome_region",
    hue="category",
)
Out[38]:
In [39]:
# Violin version of the CLL vs normal PDR comparison.
sns.violinplot(
    data=total_pdr_pairs,
    x="PDR_difference",
    y="genome_region",
    hue="category",
)
Out[39]:
In [ ]:
In [40]:
# Set the style before drawing: seaborn styles only apply to axes created afterwards,
# so calling set_style() after boxplot() left this figure in the previous style on
# the first run of the cell.
sns.set_style("whitegrid")
ax = sns.boxplot(y=total_pdr_pairs.genome_region, x=total_pdr_pairs.PDR_difference, hue=total_pdr_pairs.category)
# Drop the hue legend; guarded so the cell does not fail if no legend was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
sns.despine(left=True)
# Blank out both axis labels.
ax.set_ylabel('')
ax.set_xlabel('')
plt.xlim(-0.01, 0.7)
Out[40]:
In [41]:
# CLL vs normal PDR comparison with outlier markers hidden.
sns.boxplot(
    data=total_pdr_pairs,
    x="PDR_difference",
    y="genome_region",
    hue="category",
    showfliers=False,
)
Out[41]:
In [ ]:
In [42]:
# Methylation pair-difference columns of the CLL pairs, one column per line.
# This rebinds meth_cll, which an earlier cell built with a longer column list.
meth_cll = cll_pairs[[
    'methylation_genesDistance_difference',
    'methylation_exonsDistance_difference',
    'methylation_intronsDistance_difference',
    'methylation_promoterDistance_difference',
    'methylation_cgiDistance_difference',
    'methylation_ctcfDistance_difference',
    'methylation_ctcfDownDistance_difference',
    'methylation_geneDistalRegulatoryModulesDistance',
    'methylation_vistaEnhancersDistance_difference',
    'methylation_3PrimeUTRDistance_difference',
    'methylation_5PrimeUTRDistance_difference',
    'methylation_firstExonDistance_difference',
    'methylation_geneDistalRegulatoryModulesK562Distance_difference',
    'methylation_hypoInHues64Distance_difference',
    'methylation_intergenic_difference',
    'methylation_shore_difference',
    'methylation_shelf_difference',
]]
In [43]:
# Methylation pair-difference columns of the normal pairs, one column per line.
meth_normal = normal_pairs[[
    'methylation_genesDistance_difference',
    'methylation_exonsDistance_difference',
    'methylation_intronsDistance_difference',
    'methylation_promoterDistance_difference',
    'methylation_cgiDistance_difference',
    'methylation_ctcfDistance_difference',
    'methylation_ctcfDownDistance_difference',
    'methylation_geneDistalRegulatoryModulesDistance',
    'methylation_vistaEnhancersDistance_difference',
    'methylation_3PrimeUTRDistance_difference',
    'methylation_5PrimeUTRDistance_difference',
    'methylation_firstExonDistance_difference',
    'methylation_geneDistalRegulatoryModulesK562Distance_difference',
    'methylation_hypoInHues64Distance_difference',
    'methylation_intergenic_difference',
    'methylation_shore_difference',
    'methylation_shelf_difference',
]]
In [ ]:
In [ ]:
In [ ]:
In [44]:
# Reshape the CLL methylation subset to long form: one (genome_region, meth_difference) row per cell.
meth_strip = meth_cll.melt(var_name='genome_region', value_name='meth_difference')
In [45]:
# Reshape the normal methylation subset to long form: one (genome_region, meth_difference) row per cell.
normal_meth_strip = meth_normal.melt(var_name='genome_region', value_name='meth_difference')
In [46]:
# CLL methylation differences per genome region, outlier markers hidden, fixed x range.
ax = sns.boxplot(
    data=meth_strip,
    x="meth_difference",
    y="genome_region",
    showfliers=False,
)
ax.set_title("Methylation pair difference, CLL")
ax.set_xlim(-0.01, 0.75)
Out[46]:
In [47]:
# Normal methylation differences per genome region, outlier markers hidden.
# The x range here is narrower than in the CLL plot (upper limit 0.4 instead of 0.75).
ax = sns.boxplot(
    data=normal_meth_strip,
    x="meth_difference",
    y="genome_region",
    showfliers=False,
)
ax.set_title("Methylation pair difference, Normal B")
ax.set_xlim(-0.01, 0.4)
Out[47]:
In [48]:
# Label each long-form methylation table with its sample group before the two are stacked.
meth_strip["category"] = "CLL"
normal_meth_strip["category"] = "normal"
In [49]:
# Stack the CLL and normal long-form methylation tables. ignore_index=True gives the
# result a fresh 0..n-1 index; without it the row labels of both melted tables are
# repeated, which makes label-based lookups on total_meth_pairs ambiguous.
total_meth_pairs = pd.concat([meth_strip, normal_meth_strip], ignore_index=True)
In [50]:
# Preview the first rows of the stacked CLL + normal methylation table.
total_meth_pairs.head()
Out[50]:
In [51]:
# CLL vs normal methylation differences per genome region, with explicit title and axis labels.
ax = sns.boxplot(
    data=total_meth_pairs,
    x="meth_difference",
    y="genome_region",
    hue="category",
    showfliers=False,
)
ax.set_title("Methylation pair difference, CLL vs Normal B")
ax.set_ylabel("Genomic region")
ax.set_xlabel("Methylation pair difference")
Out[51]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [52]:
#
# Batch effects
#
In [53]:
# Re-read the pooled pair tables; these are the same two files loaded near the top of the notebook.
cll_pairs = pd.read_csv(filepath_or_buffer="total_CLL_pairs.csv")
normal_pairs = pd.read_csv(filepath_or_buffer="total_normal_pairs.csv")
In [54]:
# Load the five per-file CLL pair tables.
# Note that cll_p4 is pool 2 and cll_p5 is pool 1.
cll_p1 = pd.read_csv(filepath_or_buffer="total_CLL_pairs_cw154_CutSmart_proteinase_K.csv")
cll_p2 = pd.read_csv(filepath_or_buffer="total_CLL_pairs_cw154_Tris_protease_GR.csv")
cll_p3 = pd.read_csv(filepath_or_buffer="total_CLL_pairs_cw154_Tris_protease.csv")
cll_p4 = pd.read_csv(filepath_or_buffer="total_CLL_pairs_trito_pool_2.csv")
cll_p5 = pd.read_csv(filepath_or_buffer="total_CLL_pairs_trito_pool_1.csv")
In [55]:
# (rows, columns) of the cw154_CutSmart_proteinase_K table.
cll_p1.shape
Out[55]:
In [56]:
# (rows, columns) of the cw154_Tris_protease_GR table.
cll_p2.shape
Out[56]:
In [57]:
# (rows, columns) of the cw154_Tris_protease table.
cll_p3.shape
Out[57]:
In [58]:
# (rows, columns) of the trito_pool_2 table.
cll_p4.shape
Out[58]:
In [59]:
# (rows, columns) of the trito_pool_1 table.
cll_p5.shape
Out[59]:
In [60]:
# Column labels of the cw154_CutSmart_proteinase_K table.
cll_p1.columns
Out[60]:
In [61]:
# Methylation pair-difference columns of cll_p1, one column per line.
cll_p1_pairs_meth = cll_p1[[
    'methylation_genesDistance_difference',
    'methylation_exonsDistance_difference',
    'methylation_intronsDistance_difference',
    'methylation_promoterDistance_difference',
    'methylation_cgiDistance_difference',
    'methylation_ctcfDistance_difference',
    'methylation_ctcfDownDistance_difference',
    'methylation_geneDistalRegulatoryModulesDistance',
    'methylation_vistaEnhancersDistance_difference',
    'methylation_3PrimeUTRDistance_difference',
    'methylation_5PrimeUTRDistance_difference',
    'methylation_firstExonDistance_difference',
    'methylation_geneDistalRegulatoryModulesK562Distance_difference',
    'methylation_hypoInHues64Distance_difference',
    'methylation_intergenic_difference',
    'methylation_shore_difference',
    'methylation_shelf_difference',
]]
In [62]:
# PDR pair-difference columns of cll_p1, one column per line.
cll_p1_pairs_pdr = cll_p1[[
    'PDR_genesDistance_difference',
    'PDR_exonsDistance_difference',
    'PDR_intronsDistance_difference',
    'PDR_promoterDistance_difference',
    'PDR_cgiDistance_difference',
    'PDR_ctcfDistance_difference',
    'PDR_ctcfDownDistance_difference',
    'PDR_geneDistalRegulatoryModulesDistance',
    'PDR_vistaEnhancersDistance_difference',
    'PDR_3PrimeUTRDistance_difference',
    'PDR_5PrimeUTRDistance_difference',
    'PDR_firstExonDistance_difference',
    'PDR_geneDistalRegulatoryModulesK562Distance_difference',
    'PDR_hypoInHues64Distance_difference',
    'PDR_intergenic_difference',
    'PDR_shore_difference',
    'PDR_shelf_difference',
]]
In [63]:
# Preview the first rows of the cll_p1 methylation subset.
cll_p1_pairs_meth.head()
Out[63]:
In [64]:
# Preview the first rows of the cll_p1 PDR subset.
cll_p1_pairs_pdr.head()
Out[64]:
In [65]:
# Reshape the cll_p1 methylation subset to long form.
meth_strip1 = cll_p1_pairs_meth.melt(var_name='genome_region', value_name='meth_difference')
In [66]:
# Reshape the cll_p1 PDR subset to long form.
# NOTE(review): the value column is named 'meth_difference' although it holds PDR
# values; the plotting cell that follows reads it under that name.
cll_strip1 = cll_p1_pairs_pdr.melt(var_name='genome_region', value_name='meth_difference')
In [67]:
# Methylation differences per genome region for cll_p1, outlier markers hidden.
ax = sns.boxplot(
    data=meth_strip1,
    x="meth_difference",
    y="genome_region",
    showfliers=False,
)
ax.set_title("Methylation pair difference, CLL cw154_CutSmart_proteinase_K")
ax.set_xlim(-0.01, 0.75)
Out[67]:
In [68]:
# PDR differences per genome region for cll_p1, outlier markers hidden.
sns.boxplot(y=cll_strip1.genome_region, x=cll_strip1.meth_difference, showfliers=False)
plt.title("PDR pair difference, CLL cw154_CutSmart_proteinase_K")
# The melted value column is named 'meth_difference' even though it holds PDR values,
# so the automatic x label would present PDR data as methylation. Relabel it to
# match the other PDR plots in this notebook.
plt.xlabel("PDR_difference")
plt.xlim(-0.01, 0.75)
Out[68]:
In [ ]:
In [ ]:
In [69]:
# Methylation pair-difference columns of cll_p5, one column per line.
cll_p5_pairs_meth = cll_p5[[
    'methylation_genesDistance_difference',
    'methylation_exonsDistance_difference',
    'methylation_intronsDistance_difference',
    'methylation_promoterDistance_difference',
    'methylation_cgiDistance_difference',
    'methylation_ctcfDistance_difference',
    'methylation_ctcfDownDistance_difference',
    'methylation_geneDistalRegulatoryModulesDistance',
    'methylation_vistaEnhancersDistance_difference',
    'methylation_3PrimeUTRDistance_difference',
    'methylation_5PrimeUTRDistance_difference',
    'methylation_firstExonDistance_difference',
    'methylation_geneDistalRegulatoryModulesK562Distance_difference',
    'methylation_hypoInHues64Distance_difference',
    'methylation_intergenic_difference',
    'methylation_shore_difference',
    'methylation_shelf_difference',
]]
In [70]:
# PDR pair-difference columns of cll_p5, one column per line.
cll_p5_pairs_pdr = cll_p5[[
    'PDR_genesDistance_difference',
    'PDR_exonsDistance_difference',
    'PDR_intronsDistance_difference',
    'PDR_promoterDistance_difference',
    'PDR_cgiDistance_difference',
    'PDR_ctcfDistance_difference',
    'PDR_ctcfDownDistance_difference',
    'PDR_geneDistalRegulatoryModulesDistance',
    'PDR_vistaEnhancersDistance_difference',
    'PDR_3PrimeUTRDistance_difference',
    'PDR_5PrimeUTRDistance_difference',
    'PDR_firstExonDistance_difference',
    'PDR_geneDistalRegulatoryModulesK562Distance_difference',
    'PDR_hypoInHues64Distance_difference',
    'PDR_intergenic_difference',
    'PDR_shore_difference',
    'PDR_shelf_difference',
]]
In [71]:
# Reshape the cll_p5 methylation subset to long form.
meth_strip5 = cll_p5_pairs_meth.melt(var_name='genome_region', value_name='meth_difference')
In [72]:
# Reshape the cll_p5 PDR subset to long form.
# NOTE(review): the value column is named 'meth_difference' although it holds PDR
# values; the plotting cell that follows reads it under that name.
cll_strip5 = cll_p5_pairs_pdr.melt(var_name='genome_region', value_name='meth_difference')
In [73]:
# Methylation differences per genome region for cll_p5, outlier markers hidden.
ax = sns.boxplot(
    data=meth_strip5,
    x="meth_difference",
    y="genome_region",
    showfliers=False,
)
ax.set_title("Methylation pair difference, CLL trito_pool1")
ax.set_xlim(-0.01, 0.75)
Out[73]:
In [74]:
# PDR differences per genome region for cll_p5, outlier markers hidden.
sns.boxplot(y=cll_strip5.genome_region, x=cll_strip5.meth_difference, showfliers=False)
plt.title("PDR pair difference, CLL trito_pool1")
# The melted value column is named 'meth_difference' even though it holds PDR values,
# so the automatic x label would present PDR data as methylation. Relabel it to
# match the other PDR plots in this notebook.
plt.xlabel("PDR_difference")
plt.xlim(-0.01, 0.75)
Out[74]:
In [ ]:
In [ ]:
In [75]:
# Load the Normal_B_cell_A1_24 pair table.
normal_p1 = pd.read_csv(filepath_or_buffer="total_normal_pairs_Normal_B_cell_A1_24.csv")
In [76]:
# Methylation pair-difference columns of normal_p1, one column per line.
normalA_meth = normal_p1[[
    'methylation_genesDistance_difference',
    'methylation_exonsDistance_difference',
    'methylation_intronsDistance_difference',
    'methylation_promoterDistance_difference',
    'methylation_cgiDistance_difference',
    'methylation_ctcfDistance_difference',
    'methylation_ctcfDownDistance_difference',
    'methylation_geneDistalRegulatoryModulesDistance',
    'methylation_vistaEnhancersDistance_difference',
    'methylation_3PrimeUTRDistance_difference',
    'methylation_5PrimeUTRDistance_difference',
    'methylation_firstExonDistance_difference',
    'methylation_geneDistalRegulatoryModulesK562Distance_difference',
    'methylation_hypoInHues64Distance_difference',
    'methylation_intergenic_difference',
    'methylation_shore_difference',
    'methylation_shelf_difference',
]]
In [77]:
# PDR pair-difference columns of normal_p1, one column per line.
normalA_pdr = normal_p1[[
    'PDR_genesDistance_difference',
    'PDR_exonsDistance_difference',
    'PDR_intronsDistance_difference',
    'PDR_promoterDistance_difference',
    'PDR_cgiDistance_difference',
    'PDR_ctcfDistance_difference',
    'PDR_ctcfDownDistance_difference',
    'PDR_geneDistalRegulatoryModulesDistance',
    'PDR_vistaEnhancersDistance_difference',
    'PDR_3PrimeUTRDistance_difference',
    'PDR_5PrimeUTRDistance_difference',
    'PDR_firstExonDistance_difference',
    'PDR_geneDistalRegulatoryModulesK562Distance_difference',
    'PDR_hypoInHues64Distance_difference',
    'PDR_intergenic_difference',
    'PDR_shore_difference',
    'PDR_shelf_difference',
]]
In [78]:
# Reshape the normal_p1 methylation subset to long form.
meth_normalA = normalA_meth.melt(var_name='genome_region', value_name='meth_difference')
In [79]:
# Reshape the normal_p1 PDR subset to long form.
# NOTE(review): the value column is named 'meth_difference' although it holds PDR
# values; the plotting cell that follows reads it under that name.
pdr_normalA = normalA_pdr.melt(var_name='genome_region', value_name='meth_difference')
In [80]:
# Methylation differences per genome region for normal_p1, outlier markers hidden.
ax = sns.boxplot(
    data=meth_normalA,
    x="meth_difference",
    y="genome_region",
    showfliers=False,
)
ax.set_title("Methylation pair difference, Normal_B_cell_A1")
ax.set_xlim(-0.01, 0.75)
Out[80]:
In [81]:
# PDR differences per genome region for normal_p1, outlier markers hidden.
sns.boxplot(y=pdr_normalA.genome_region, x=pdr_normalA.meth_difference, showfliers=False)
plt.title("PDR pair difference, Normal_B_cell_A1")
# The melted value column is named 'meth_difference' even though it holds PDR values,
# so the automatic x label would present PDR data as methylation. Relabel it to
# match the other PDR plots in this notebook.
plt.xlabel("PDR_difference")
plt.xlim(-0.01, 0.75)
Out[81]:
In [ ]:
In [82]:
# Load the NormalBCD19pCD27mcell1_22 pair table.
mcell = pd.read_csv(filepath_or_buffer="total_normal_pairs_NormalBCD19pCD27mcell1_22.csv")
In [83]:
# Methylation pair-difference columns of mcell, one column per line.
mcell22_meth = mcell[[
    'methylation_genesDistance_difference',
    'methylation_exonsDistance_difference',
    'methylation_intronsDistance_difference',
    'methylation_promoterDistance_difference',
    'methylation_cgiDistance_difference',
    'methylation_ctcfDistance_difference',
    'methylation_ctcfDownDistance_difference',
    'methylation_geneDistalRegulatoryModulesDistance',
    'methylation_vistaEnhancersDistance_difference',
    'methylation_3PrimeUTRDistance_difference',
    'methylation_5PrimeUTRDistance_difference',
    'methylation_firstExonDistance_difference',
    'methylation_geneDistalRegulatoryModulesK562Distance_difference',
    'methylation_hypoInHues64Distance_difference',
    'methylation_intergenic_difference',
    'methylation_shore_difference',
    'methylation_shelf_difference',
]]
In [84]:
# PDR pair-difference columns of mcell, one column per line.
mcell22_pdr = mcell[[
    'PDR_genesDistance_difference',
    'PDR_exonsDistance_difference',
    'PDR_intronsDistance_difference',
    'PDR_promoterDistance_difference',
    'PDR_cgiDistance_difference',
    'PDR_ctcfDistance_difference',
    'PDR_ctcfDownDistance_difference',
    'PDR_geneDistalRegulatoryModulesDistance',
    'PDR_vistaEnhancersDistance_difference',
    'PDR_3PrimeUTRDistance_difference',
    'PDR_5PrimeUTRDistance_difference',
    'PDR_firstExonDistance_difference',
    'PDR_geneDistalRegulatoryModulesK562Distance_difference',
    'PDR_hypoInHues64Distance_difference',
    'PDR_intergenic_difference',
    'PDR_shore_difference',
    'PDR_shelf_difference',
]]
In [85]:
# Reshape the mcell methylation subset to long form.
meth_mcell22 = mcell22_meth.melt(var_name='genome_region', value_name='meth_difference')
In [86]:
# Reshape the mcell PDR subset to long form.
# NOTE(review): the value column is named 'meth_difference' although it holds PDR
# values; the plotting cell that follows reads it under that name.
pdr_mcell22 = mcell22_pdr.melt(var_name='genome_region', value_name='meth_difference')
In [87]:
# Methylation differences per genome region for mcell, outlier markers hidden.
ax = sns.boxplot(
    data=meth_mcell22,
    x="meth_difference",
    y="genome_region",
    showfliers=False,
)
ax.set_title("Methylation pair difference, mcell22")
ax.set_xlim(-0.01, 0.75)
Out[87]:
In [88]:
# PDR differences per genome region for mcell, outlier markers hidden.
sns.boxplot(y=pdr_mcell22.genome_region, x=pdr_mcell22.meth_difference, showfliers=False)
plt.title("PDR pair difference, mcell22")
# The melted value column is named 'meth_difference' even though it holds PDR values,
# so the automatic x label would present PDR data as methylation. Relabel it to
# match the other PDR plots in this notebook.
plt.xlabel("PDR_difference")
plt.xlim(-0.01, 0.75)
Out[88]:
In [ ]:
In [89]:
# Load the NormalBCD19pCD27mcell23_44 pair table.
mcell234 = pd.read_csv(filepath_or_buffer="total_normal_pairs_NormalBCD19pCD27mcell23_44.csv")
In [90]:
# Methylation pair-difference columns of mcell234, one column per line.
mcell234_meth = mcell234[[
    'methylation_genesDistance_difference',
    'methylation_exonsDistance_difference',
    'methylation_intronsDistance_difference',
    'methylation_promoterDistance_difference',
    'methylation_cgiDistance_difference',
    'methylation_ctcfDistance_difference',
    'methylation_ctcfDownDistance_difference',
    'methylation_geneDistalRegulatoryModulesDistance',
    'methylation_vistaEnhancersDistance_difference',
    'methylation_3PrimeUTRDistance_difference',
    'methylation_5PrimeUTRDistance_difference',
    'methylation_firstExonDistance_difference',
    'methylation_geneDistalRegulatoryModulesK562Distance_difference',
    'methylation_hypoInHues64Distance_difference',
    'methylation_intergenic_difference',
    'methylation_shore_difference',
    'methylation_shelf_difference',
]]
In [91]:
# PDR pair-difference columns of mcell234, one column per line.
mcell234_pdr = mcell234[[
    'PDR_genesDistance_difference',
    'PDR_exonsDistance_difference',
    'PDR_intronsDistance_difference',
    'PDR_promoterDistance_difference',
    'PDR_cgiDistance_difference',
    'PDR_ctcfDistance_difference',
    'PDR_ctcfDownDistance_difference',
    'PDR_geneDistalRegulatoryModulesDistance',
    'PDR_vistaEnhancersDistance_difference',
    'PDR_3PrimeUTRDistance_difference',
    'PDR_5PrimeUTRDistance_difference',
    'PDR_firstExonDistance_difference',
    'PDR_geneDistalRegulatoryModulesK562Distance_difference',
    'PDR_hypoInHues64Distance_difference',
    'PDR_intergenic_difference',
    'PDR_shore_difference',
    'PDR_shelf_difference',
]]
In [92]:
# Reshape the mcell234 methylation subset to long form.
meth_mcell234 = mcell234_meth.melt(var_name='genome_region', value_name='meth_difference')
In [93]:
# Reshape the mcell234 PDR subset to long form.
# NOTE(review): the value column is named 'meth_difference' although it holds PDR
# values; the plotting cell that follows reads it under that name.
pdr_mcell234 = mcell234_pdr.melt(var_name='genome_region', value_name='meth_difference')
In [94]:
# Methylation differences per genome region for mcell234, outlier markers hidden.
ax = sns.boxplot(
    data=meth_mcell234,
    x="meth_difference",
    y="genome_region",
    showfliers=False,
)
ax.set_title("Methylation pair difference, mcell23_44")
ax.set_xlim(-0.01, 0.75)
Out[94]:
In [95]:
# PDR differences per genome region for mcell234, outlier markers hidden.
sns.boxplot(y=pdr_mcell234.genome_region, x=pdr_mcell234.meth_difference, showfliers=False)
plt.title("PDR pair difference, mcell23_44")
# The melted value column is named 'meth_difference' even though it holds PDR values,
# so the automatic x label would present PDR data as methylation. Relabel it to
# match the other PDR plots in this notebook.
plt.xlabel("PDR_difference")
plt.xlim(-0.01, 0.75)
Out[95]:
In [ ]:
In [ ]:
In [ ]: