In [1]:
%matplotlib inline
In [2]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
# Show up to 50 columns when displaying wide DataFrames (this limits columns, not rows).
pd.set_option('display.max_columns', 50)
import os
# Every CSV below is read relative to the data directory. It defaults to the
# original author's local folder; set the RRBS_DATA_DIR environment variable to
# run the notebook on another machine without editing this cell.
os.chdir(os.environ.get('RRBS_DATA_DIR', '/Users/evanbiederstedt/Downloads/RRBS_data_files'))
import statsmodels.api as sm
In [3]:
"""
total_normal_pairs_RRBS_NormalBCD19pcell45_66.csv
total_normal_pairs_RRBS_NormalBCD19pcell67_88.csv
total_normal_pairs_RRBS_NormalBCD19pcell23_44.csv
total_normal_pairs_RRBS_NormalBCD19pcell1_22.csv
total_normal_pairs_NormalBCD19pCD27mcell45_66.csv
total_normal_pairs_NormalBCD19pCD27mcell23_44.csv
total_normal_pairs_NormalBCD19pCD27mcell1_22.csv
total_normal_pairs_Normal_B_cell_A1_24.csv
total_normal_pairs_NormalBCD19pCD27mcell67_88.csv
total_normal_pairs_Normal_B_cell_H1_22.csv
total_normal_pairs_Normal_B_cell_G1_22.csv
total_normal_pairs_Normal_B_cell_D1_24.csv
total_normal_pairs_Normal_B_cell_C1_24.csv
total_normal_pairs_Normal_B_cell_B1_24.csv
total_CLL_pairs.csv
total_normal_pairs.csv
total_CLL_pairs_cw154_CutSmart_proteinase_K.csv
total_CLL_pairs_cw154_Tris_protease_GR.csv
total_CLL_pairs_cw154_Tris_protease.csv
total_CLL_pairs_trito_pool_2.csv
total_CLL_pairs_trito_pool_1.csv
"""
Out[3]:
In [4]:
# Read the CLL and normal pair tables; both paths are relative to the current
# working directory. The generator variable does not leak into the namespace.
cll_pairs, normal_pairs = (
    pd.read_csv(csv_name)
    for csv_name in ("total_CLL_pairs.csv", "total_normal_pairs.csv")
)
In [5]:
# Dimensions of the CLL pair table as (rows, columns).
cll_pairs.shape
Out[5]:
In [6]:
# Dimensions of the normal pair table as (rows, columns).
normal_pairs.shape
Out[6]:
In [7]:
# Column labels of the CLL pair table; the selections further down pick from these.
cll_pairs.columns
Out[7]:
In [8]:
# Preview the first rows of the CLL pair table.
cll_pairs.head()
Out[8]:
In [9]:
# Dimensions of the CLL pair table again; no cell since the previous check has
# modified cll_pairs, so this repeats that result.
cll_pairs.shape
Out[9]:
In [10]:
# Column labels of the CLL pair table again (unchanged since the earlier listing).
cll_pairs.columns
Out[10]:
In [11]:
# CLL pair table restricted to the methylation columns (20 in total).
# NOTE(review): 'methylation_geneDistalRegulatoryModulesDistance' is the only
# name here without a '_difference' suffix — confirm it is the intended column.
# This name is reassigned to a narrower 17-column selection later in the notebook.
meth_cll = cll_pairs.loc[:, [
    'methylation_unweighted_difference',
    'methylation_tssDistance_difference',
    'methylation_genesDistance_difference',
    'methylation_exonsDistance_difference',
    'methylation_intronsDistance_difference',
    'methylation_promoterDistance_difference',
    'methylation_cgiDistance_difference',
    'methylation_ctcfDistance_difference',
    'methylation_ctcfUpDistance_difference',
    'methylation_ctcfDownDistance_difference',
    'methylation_geneDistalRegulatoryModulesDistance',
    'methylation_vistaEnhancersDistance_difference',
    'methylation_3PrimeUTRDistance_difference',
    'methylation_5PrimeUTRDistance_difference',
    'methylation_firstExonDistance_difference',
    'methylation_geneDistalRegulatoryModulesK562Distance_difference',
    'methylation_hypoInHues64Distance_difference',
    'methylation_intergenic_difference',
    'methylation_shore_difference',
    'methylation_shelf_difference',
]]
In [12]:
# Wide-form boxplot: one box per methylation column of the CLL table.
# The frame is passed as `data=` explicitly; a positional first argument is
# bound to `x` in seaborn releases before 0.12, so the keyword form is the one
# that means "wide-form DataFrame" in every version.
sns.boxplot(data=meth_cll)
Out[12]:
In [13]:
# CLL pair table restricted to the 17 PDR columns used in the plots below.
# NOTE(review): 'PDR_geneDistalRegulatoryModulesDistance' is the only name here
# without a '_difference' suffix — confirm it is the intended column.
pdr_cll = cll_pairs.loc[:, [
    'PDR_genesDistance_difference',
    'PDR_exonsDistance_difference',
    'PDR_intronsDistance_difference',
    'PDR_promoterDistance_difference',
    'PDR_cgiDistance_difference',
    'PDR_ctcfDistance_difference',
    'PDR_ctcfDownDistance_difference',
    'PDR_geneDistalRegulatoryModulesDistance',
    'PDR_vistaEnhancersDistance_difference',
    'PDR_3PrimeUTRDistance_difference',
    'PDR_5PrimeUTRDistance_difference',
    'PDR_firstExonDistance_difference',
    'PDR_geneDistalRegulatoryModulesK562Distance_difference',
    'PDR_hypoInHues64Distance_difference',
    'PDR_intergenic_difference',
    'PDR_shore_difference',
    'PDR_shelf_difference',
]]
In [14]:
# Normal pair table restricted to the same 17 PDR columns, in the same order,
# as the CLL selection, so the two frames can be stacked and compared.
# NOTE(review): 'PDR_geneDistalRegulatoryModulesDistance' is the only name here
# without a '_difference' suffix — confirm it is the intended column.
pdr_normal = normal_pairs.loc[:, [
    'PDR_genesDistance_difference',
    'PDR_exonsDistance_difference',
    'PDR_intronsDistance_difference',
    'PDR_promoterDistance_difference',
    'PDR_cgiDistance_difference',
    'PDR_ctcfDistance_difference',
    'PDR_ctcfDownDistance_difference',
    'PDR_geneDistalRegulatoryModulesDistance',
    'PDR_vistaEnhancersDistance_difference',
    'PDR_3PrimeUTRDistance_difference',
    'PDR_5PrimeUTRDistance_difference',
    'PDR_firstExonDistance_difference',
    'PDR_geneDistalRegulatoryModulesK562Distance_difference',
    'PDR_hypoInHues64Distance_difference',
    'PDR_intergenic_difference',
    'PDR_shore_difference',
    'PDR_shelf_difference',
]]
In [15]:
# Dimensions of the CLL PDR selection as (rows, columns).
pdr_cll.shape
Out[15]:
In [16]:
# Dimensions of the normal PDR selection as (rows, columns).
pdr_normal.shape
Out[16]:
In [17]:
# Stack the CLL rows on top of the normal rows (row-wise concatenation);
# each frame keeps its own index labels.
pdr_all = pd.concat((pdr_cll, pdr_normal), axis=0)
In [18]:
# Dimensions of the stacked CLL + normal PDR table as (rows, columns).
pdr_all.shape
Out[18]:
In [19]:
# Wide-form boxplot: one box per PDR column over the stacked CLL + normal rows.
# `data=` is given explicitly because a positional first argument is bound to
# `x` in seaborn releases before 0.12.
sns.boxplot(data=pdr_all)
Out[19]:
In [20]:
# Preview the first rows of the CLL PDR selection.
pdr_cll.head()
Out[20]:
In [21]:
# Horizontal wide-form boxplot: one box per PDR column of the CLL table.
# `data=` is given explicitly because a positional first argument is bound to
# `x` in seaborn releases before 0.12.
sns.boxplot(data=pdr_cll, orient="h")
# Fix the x-range so this figure shares its scale with the matching normal plot.
plt.xlim(-0.01, 0.55)
Out[21]:
In [22]:
# Horizontal wide-form boxplot of the normal PDR columns with automatic x-limits.
# `data=` is given explicitly because a positional first argument is bound to
# `x` in seaborn releases before 0.12.
sns.boxplot(data=pdr_normal, orient="h")
Out[22]:
In [23]:
# Same normal-sample plot, with the x-range pinned to the limits used for the
# CLL figure so the two can be compared directly.
# `data=` is given explicitly because a positional first argument is bound to
# `x` in seaborn releases before 0.12.
sns.boxplot(data=pdr_normal, orient="h")
plt.xlim(-0.01, 0.55)
Out[23]:
In [24]:
# Column labels of the stacked CLL + normal PDR table.
pdr_all.columns
Out[24]:
In [25]:
# Preview the first rows of the stacked table (these come from pdr_cll, which
# was listed first in the concat).
pdr_all.head()
Out[25]:
In [26]:
# Preview the first rows of the CLL PDR selection again (pdr_cll is unchanged
# since the earlier preview).
pdr_cll.head()
Out[26]:
In [27]:
# Column labels of the CLL PDR selection; these become the 'genome_region'
# values when the frame is melted in the next cell.
pdr_cll.columns
Out[27]:
In [28]:
# Reshape the CLL PDR table from wide to long: every former column name goes
# into 'genome_region' and its values into 'PDR_difference'.
pdr_strip = pd.melt(
    pdr_cll,
    var_name='genome_region',
    value_name='PDR_difference',
)
In [29]:
# Display the melted (long-format) CLL table.
pdr_strip
Out[29]:
In [30]:
# Vertical boxplot of PDR pair differences per genomic region (CLL), with the
# columns addressed by name through `data=`.
sns.boxplot(data=pdr_strip, x="genome_region", y="PDR_difference")
Out[30]:
In [ ]:
In [31]:
# Horizontal layout: regions on the y-axis, PDR pair differences on the x-axis.
sns.boxplot(data=pdr_strip, x="PDR_difference", y="genome_region")
plt.title("PDR pair difference, CLL")
Out[31]:
In [58]:
# Same CLL plot with outlier markers hidden and a fixed x-range.
sns.boxplot(
    data=pdr_strip,
    x="PDR_difference",
    y="genome_region",
    showfliers=False,
)
plt.title("PDR pair difference, CLL")
plt.xlim(-0.01, 0.75)
Out[58]:
In [33]:
# Long-format version of the normal PDR table, using the same column names as
# the CLL long table ('genome_region', 'PDR_difference').
normalpdr_strip = pd.melt(
    pdr_normal,
    var_name='genome_region',
    value_name='PDR_difference',
)
In [59]:
# Normal B-cell counterpart of the CLL figure: outlier markers hidden, same
# fixed x-range.
sns.boxplot(
    data=normalpdr_strip,
    x="PDR_difference",
    y="genome_region",
    showfliers=False,
)
plt.title("PDR pair difference, Normal B")
plt.xlim(-0.01, 0.75)
Out[59]:
In [36]:
# Label every row with its cohort so the two long tables stay distinguishable
# once they are stacked. Both frames are modified in place.
pdr_strip["category"] = "CLL"
normalpdr_strip["category"] = "normal"
In [37]:
# Stack the CLL and normal long tables into one frame for grouped plots.
# The index of a melted frame carries no information, so it is rebuilt with
# ignore_index=True; otherwise each label would appear once per cohort and the
# combined frame would have duplicate index labels.
total_pdr_pairs = pd.concat([pdr_strip, normalpdr_strip], ignore_index=True)
In [40]:
# Preview the first rows of the combined long table (these come from pdr_strip,
# which was listed first in the concat).
total_pdr_pairs.head()
Out[40]:
In [42]:
# Grouped horizontal boxplot: one pair of boxes (CLL vs normal) per genomic region.
sns.boxplot(
    data=total_pdr_pairs,
    x="PDR_difference",
    y="genome_region",
    hue="category",
)
Out[42]:
In [43]:
# Same grouped comparison drawn as violins instead of boxes.
sns.violinplot(
    data=total_pdr_pairs,
    x="PDR_difference",
    y="genome_region",
    hue="category",
)
Out[43]:
In [ ]:
In [52]:
# Apply the style before drawing: sns.set_style only affects axes created
# afterwards, so calling it after the boxplot left this figure unstyled.
sns.set_style("whitegrid")
ax = sns.boxplot(y=total_pdr_pairs.genome_region, x=total_pdr_pairs.PDR_difference, hue=total_pdr_pairs.category)
# Strip the legend, the left spine and both axis labels from the figure.
ax.legend_.remove()
sns.despine(left=True)
ax.set_ylabel('')
ax.set_xlabel('')
plt.xlim(-0.01, 0.7)
Out[52]:
In [61]:
# Grouped CLL-vs-normal boxplot with outlier markers hidden.
sns.boxplot(
    data=total_pdr_pairs,
    x="PDR_difference",
    y="genome_region",
    hue="category",
    showfliers=False,
)
Out[61]:
In [ ]:
In [68]:
# Rebind meth_cll to the 17 methylation columns whose regions match the PDR
# selection (the earlier 20-column version is discarded).
# NOTE(review): 'methylation_geneDistalRegulatoryModulesDistance' is the only
# name here without a '_difference' suffix — confirm it is the intended column.
meth_cll = cll_pairs.loc[:, [
    'methylation_genesDistance_difference',
    'methylation_exonsDistance_difference',
    'methylation_intronsDistance_difference',
    'methylation_promoterDistance_difference',
    'methylation_cgiDistance_difference',
    'methylation_ctcfDistance_difference',
    'methylation_ctcfDownDistance_difference',
    'methylation_geneDistalRegulatoryModulesDistance',
    'methylation_vistaEnhancersDistance_difference',
    'methylation_3PrimeUTRDistance_difference',
    'methylation_5PrimeUTRDistance_difference',
    'methylation_firstExonDistance_difference',
    'methylation_geneDistalRegulatoryModulesK562Distance_difference',
    'methylation_hypoInHues64Distance_difference',
    'methylation_intergenic_difference',
    'methylation_shore_difference',
    'methylation_shelf_difference',
]]
In [69]:
# Normal pair table restricted to the same 17 methylation columns, in the same
# order, as the CLL selection.
# NOTE(review): 'methylation_geneDistalRegulatoryModulesDistance' is the only
# name here without a '_difference' suffix — confirm it is the intended column.
meth_normal = normal_pairs.loc[:, [
    'methylation_genesDistance_difference',
    'methylation_exonsDistance_difference',
    'methylation_intronsDistance_difference',
    'methylation_promoterDistance_difference',
    'methylation_cgiDistance_difference',
    'methylation_ctcfDistance_difference',
    'methylation_ctcfDownDistance_difference',
    'methylation_geneDistalRegulatoryModulesDistance',
    'methylation_vistaEnhancersDistance_difference',
    'methylation_3PrimeUTRDistance_difference',
    'methylation_5PrimeUTRDistance_difference',
    'methylation_firstExonDistance_difference',
    'methylation_geneDistalRegulatoryModulesK562Distance_difference',
    'methylation_hypoInHues64Distance_difference',
    'methylation_intergenic_difference',
    'methylation_shore_difference',
    'methylation_shelf_difference',
]]
In [ ]:
In [ ]:
In [ ]:
In [70]:
# Reshape the CLL methylation table from wide to long: former column names go
# into 'genome_region', their values into 'meth_difference'.
meth_strip = pd.melt(
    meth_cll,
    var_name='genome_region',
    value_name='meth_difference',
)
In [71]:
# Long-format version of the normal methylation table, using the same column
# names as the CLL long table.
normal_meth_strip = pd.melt(
    meth_normal,
    var_name='genome_region',
    value_name='meth_difference',
)
In [72]:
# Horizontal boxplot of methylation pair differences per region (CLL), with
# outlier markers hidden and a fixed x-range.
sns.boxplot(
    data=meth_strip,
    x="meth_difference",
    y="genome_region",
    showfliers=False,
)
plt.title("Methylation pair difference, CLL")
plt.xlim(-0.01, 0.75)
Out[72]:
In [73]:
# Normal B-cell counterpart of the CLL methylation figure, on the same x-range.
sns.boxplot(
    data=normal_meth_strip,
    x="meth_difference",
    y="genome_region",
    showfliers=False,
)
plt.title("Methylation pair difference, Normal B")
plt.xlim(-0.01, 0.75)
Out[73]:
In [74]:
# Label every row with its cohort so the two long tables stay distinguishable
# once they are stacked. Both frames are modified in place.
meth_strip["category"] = "CLL"
normal_meth_strip["category"] = "normal"
In [76]:
# Stack the CLL and normal long methylation tables into one frame.
# The index of a melted frame carries no information, so it is rebuilt with
# ignore_index=True; otherwise the combined frame would have duplicate index
# labels, one copy per cohort.
total_meth_pairs = pd.concat([meth_strip, normal_meth_strip], ignore_index=True)
In [80]:
# Preview the first rows of the combined long table (these come from
# meth_strip, which was listed first in the concat).
total_meth_pairs.head()
Out[80]:
In [81]:
# Final comparison figure: CLL vs normal methylation pair differences per
# genomic region, with outlier markers hidden and explicit title and axis labels.
sns.boxplot(
    data=total_meth_pairs,
    x="meth_difference",
    y="genome_region",
    hue="category",
    showfliers=False,
)
plt.title("Methylation pair difference, CLL vs Normal B")
plt.ylabel("Genomic region")
plt.xlabel("Methylation pair difference")
Out[81]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: