In [1]:
%matplotlib inline

In [2]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # print all rows


import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')

In [3]:
#
# CLL_cells_PDR_total_chr2_5_11A.csv
# CLL_cells_PDR_total_chr2_5_11B.csv
# CLL_cells_PDR_total_chr2_5_11_tritoC.csv
# Normal_cells_PDR_total_chr2_5_11A.csv
# Normal_cells_PDR_total_chr2_5_11B.csv
# Normal_cells_PDR_total_chr2_5_11C_mcells.csv
#

In [4]:
"""
RRBS_NormalBCD19pCD27pcell_CpGs.csv
RRBS_NormalBCD19pCD27mcell_CpGs.csv
Meth_PDR_cell_RRBS_normal_B1_CpGs.csv
Meth_PDR_cell_RRBS_trito_pool_CpGs.csv
CLL_RRBS_cw154_A_CpGs.csv
"""


Out[4]:
'\nRRBS_NormalBCD19pCD27pcell_CpGs.csv\nRRBS_NormalBCD19pCD27mcell_CpGs.csv\nMeth_PDR_cell_RRBS_normal_B1_CpGs.csv\nMeth_PDR_cell_RRBS_trito_pool_CpGs.csv\nCLL_RRBS_cw154_A_CpGs.csv\n'

In [5]:
normal_cellA_df = pd.read_csv("Meth_PDR_cell_RRBS_normal_B1.csv")
normal_cellB_df = pd.read_csv("Meth_PDR_cell_normalpcell.csv")
normal_cellC_df = pd.read_csv("Meth_PDR_cell_normal_mcell.csv")

In [ ]:


In [6]:
normal_cellA_df = normal_cellA_df.drop(["Unnamed: 0"], axis=1)

In [7]:
normal_cellA_df["type"] = str('normal')

In [8]:
normal_cellA_df["bio"] = str('normal_B')

In [9]:
normal_cellA_df["protocol"] = normal_cellA_df["filename"].str[5:24]

In [10]:
normal_cellA_df["filename"] = normal_cellA_df["filename"].str[:40]

In [11]:
normal_cellA_df.head()


Out[11]:
filename methylation PDR_total thisMeth mixedReadCount total_reads type bio protocol
0 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACAACC 0.584639 0.271471 1142525.0 530519.0 1954240.0 normal normal_B normal_B_cell_A1_24
1 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACCGCG 0.524969 0.435593 318935.0 264636.0 607531.0 normal normal_B normal_B_cell_A1_24
2 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACGTGG 0.560134 0.280063 978219.0 489103.0 1746403.0 normal normal_B normal_B_cell_A1_24
3 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACTCAC 0.613036 0.423883 1345.0 930.0 2194.0 normal normal_B normal_B_cell_A1_24
4 RRBS_normal_B_cell_A1_24_TAAGGCGA.AGGATG 0.607458 0.253306 2177284.0 907915.0 3584255.0 normal normal_B normal_B_cell_A1_24

In [12]:
cpg1 = pd.read_csv('Meth_PDR_cell_RRBS_normal_B1_CpGs.csv')

In [13]:
cpg1 = cpg1.drop(["Unnamed: 0"], axis=1)

In [14]:
cpg1["filename"] = cpg1["filename"].str[:40]

In [15]:
cpg1.head()


Out[15]:
filename avgReadCpGs_median avgReadCpGs_mean
0 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACAACC 4.0 5.295301
1 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACCGCG 4.0 5.285714
2 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACGTGG 5.0 5.453122
3 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACTCAC 4.0 4.950166
4 RRBS_normal_B_cell_A1_24_TAAGGCGA.AGGATG 5.0 5.366276

In [16]:
normal_cellA_df = pd.merge(normal_cellA_df, cpg1, how='inner')

In [17]:
normal_cellB_df["type"] = str('normal')

In [18]:
normal_cellB_df = normal_cellB_df.drop(["Unnamed: 0"], axis=1)

In [19]:
normal_cellB_df.head()


Out[19]:
filename methylation PDR_total thisMeth mixedReadCount total_reads type
0 RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACAACC... 0.538553 0.332004 519881.0 320493.0 965329.0 normal
1 RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACCGCG... 0.398948 0.274622 61736.0 42497.0 154747.0 normal
2 RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACGTGG... 0.496259 0.314214 199.0 126.0 401.0 normal
3 RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACTCAC... 0.527652 0.252527 534097.0 255611.0 1012214.0 normal
4 RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.AGGATG... 0.419489 0.305627 221463.0 161351.0 527935.0 normal

In [20]:
normal_cellB_df["bio"] = str('CD27p')

In [21]:
normal_cellB_df["protocol"] = normal_cellB_df["filename"].str[5:31]

In [22]:
normal_cellB_df["filename"] = normal_cellB_df["filename"].str[:50]

In [23]:
normal_cellB_df["filename"] = normal_cellB_df["filename"].str.replace(r'.dan$', '')
normal_cellB_df["filename"] = normal_cellB_df["filename"].str.replace(r'.da$', '')

In [24]:
normal_cellB_df.head()


Out[24]:
filename methylation PDR_total thisMeth mixedReadCount total_reads type bio protocol
0 RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACAACC 0.538553 0.332004 519881.0 320493.0 965329.0 normal CD27p NormalBCD19pCD27pcell1_22_
1 RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACCGCG 0.398948 0.274622 61736.0 42497.0 154747.0 normal CD27p NormalBCD19pCD27pcell1_22_
2 RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACGTGG 0.496259 0.314214 199.0 126.0 401.0 normal CD27p NormalBCD19pCD27pcell1_22_
3 RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACTCAC 0.527652 0.252527 534097.0 255611.0 1012214.0 normal CD27p NormalBCD19pCD27pcell1_22_
4 RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.AGGATG 0.419489 0.305627 221463.0 161351.0 527935.0 normal CD27p NormalBCD19pCD27pcell1_22_

In [25]:
normal_cellB_df.tail()


Out[25]:
filename methylation PDR_total thisMeth mixedReadCount total_reads type bio protocol
85 RRBS_NormalBCD19pCD27pcell67_88_GCTACGCT.GTTGAG 0.512097 0.319441 513330.0 320210.0 1002408.0 normal CD27p NormalBCD19pCD27pcell67_88
86 RRBS_NormalBCD19pCD27pcell67_88_GCTACGCT.TAGCGG 0.492260 0.307343 277618.0 173331.0 563966.0 normal CD27p NormalBCD19pCD27pcell67_88
87 RRBS_NormalBCD19pCD27pcell67_88_GCTACGCT.TATCTC 0.497594 0.368790 574159.0 425536.0 1153870.0 normal CD27p NormalBCD19pCD27pcell67_88
88 RRBS_NormalBCD19pCD27pcell67_88_GCTACGCT.TCTCTG 0.523298 0.339229 415992.0 269668.0 794943.0 normal CD27p NormalBCD19pCD27pcell67_88
89 RRBS_NormalBCD19pCD27pcell67_88_GCTACGCT.TGCTGC 0.478998 0.177315 4071.0 1507.0 8499.0 normal CD27p NormalBCD19pCD27pcell67_88

In [26]:
cpg2 = pd.read_csv('RRBS_NormalBCD19pCD27pcell_CpGs.csv')

In [27]:
cpg2 = cpg2.drop(["Unnamed: 0"], axis=1)

In [28]:
cpg2["filename"] = cpg2["filename"].str[:50]

In [29]:
cpg2["filename"] = cpg2["filename"].str.replace(r'.dan$', '')
cpg2["filename"] = cpg2["filename"].str.replace(r'.da$', '')

In [30]:
normal_cellB_df = pd.merge(normal_cellB_df, cpg2, how='inner')

In [31]:
normal_cellC_df = normal_cellC_df.drop(["Unnamed: 0"], axis=1)

In [32]:
normal_cellC_df["type"] = str('normal')

In [33]:
normal_cellC_df["protocol"] = normal_cellC_df["filename"].str[5:31]

In [34]:
normal_cellC_df["bio"] = str('CD27m')

In [35]:
normal_cellC_df["filename"] = normal_cellC_df["filename"].str[:50]

In [36]:
normal_cellC_df["filename"] = normal_cellC_df["filename"].str.replace(r'.dan$', '')
normal_cellC_df["filename"] = normal_cellC_df["filename"].str.replace(r'.da$', '')

In [ ]:


In [37]:
cpg3 = pd.read_csv("RRBS_NormalBCD19pCD27mcell_CpGs.csv")

In [38]:
cpg3 = cpg3.drop(["Unnamed: 0"], axis=1)

In [39]:
cpg3["filename"] = cpg3["filename"].str[:50]

In [40]:
cpg3["filename"] = cpg3["filename"].str.replace(r'.dan$', '')
cpg3["filename"] = cpg3["filename"].str.replace(r'.da$', '')

In [41]:
normal_cellC_df = pd.merge(normal_cellC_df, cpg3, how='inner')

In [42]:
frames3 = [normal_cellA_df, normal_cellB_df, normal_cellC_df]

normal_result = pd.concat(frames3)

In [43]:
normal_result.shape


Out[43]:
(304, 11)

In [44]:
normal_result.columns


Out[44]:
Index(['PDR_total', 'avgReadCpGs_mean', 'avgReadCpGs_median', 'bio',
       'filename', 'methylation', 'mixedReadCount', 'protocol', 'thisMeth',
       'total_reads', 'type'],
      dtype='object')

In [45]:
normal_result = normal_result[['filename', 'PDR_total', 'total_reads', 'type', 'bio', 'protocol', 'avgReadCpGs_mean', 'avgReadCpGs_median']]

In [46]:
normal_result.head()


Out[46]:
filename PDR_total total_reads type bio protocol avgReadCpGs_mean avgReadCpGs_median
0 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACAACC 0.271471 1954240.0 normal normal_B normal_B_cell_A1_24 5.295301 4.0
1 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACCGCG 0.435593 607531.0 normal normal_B normal_B_cell_A1_24 5.285714 4.0
2 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACGTGG 0.280063 1746403.0 normal normal_B normal_B_cell_A1_24 5.453122 5.0
3 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACTCAC 0.423883 2194.0 normal normal_B normal_B_cell_A1_24 4.950166 4.0
4 RRBS_normal_B_cell_A1_24_TAAGGCGA.AGGATG 0.253306 3584255.0 normal normal_B normal_B_cell_A1_24 5.366276 5.0

In [47]:
cll_cellA_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_cw154_A.csv")
# cll_cellB_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_NormalB_CLL_B.csv")
cll_cellC_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_trito_pool_C.csv")

In [ ]:


In [48]:
cll_cellA_df = cll_cellA_df.drop(["Unnamed: 0"], axis=1)

In [49]:
cll_cellA_df["type"] = str('CLL')

In [50]:
cll_cellA_df["protocol"] = cll_cellA_df["filename"].str[5:34]

In [51]:
cll_cellA_df["protocol"][cll_cellA_df["protocol"] == 'cw154_CutSmart_proteinase_K_T'] = 'cw154_CutSmart_proteinase_K'


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [52]:
cll_cellA_df["protocol"][cll_cellA_df["protocol"] == 'cw154_Tris_protease_GR_CAGAGA'] = 'cw154_Tris_protease_GR'


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [53]:
cll_cellA_df["protocol"][(cll_cellA_df["protocol"] != 'cw154_Tris_protease_GR') & (cll_cellA_df["protocol"] != 'cw154_CutSmart_proteinase_K')] = 'cw154_Tris_protease'


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [ ]:


In [54]:
cll_cellA_df["bio"] = str('CLL')

In [55]:
cll_cellA_df["filename"] = cll_cellA_df["filename"].str[:51]

In [56]:
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.da$', '')
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.annoRR$', '')
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.ann$', '')
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.dan$', '')

In [ ]:


In [57]:
cpg4 = pd.read_csv('CLL_RRBS_cw154_A_CpGs.csv')

In [58]:
cpg4 = cpg4.drop(["Unnamed: 0"], axis=1)

In [59]:
cpg4["filename"] = cpg4["filename"].str[:51]

In [60]:
cpg4["filename"] = cpg4["filename"].str.replace(r'.da$', '')
cpg4["filename"] = cpg4["filename"].str.replace(r'.annoRR$', '')
cpg4["filename"] = cpg4["filename"].str.replace(r'.ann$', '')
cpg4["filename"] = cpg4["filename"].str.replace(r'.dan$', '')

In [61]:
cll_cellA_df = pd.merge(cll_cellA_df, cpg4, how='inner')

In [ ]:


In [62]:
cll_cellC_df = cll_cellC_df.drop(["Unnamed: 0"], axis=1)

In [63]:
cll_cellC_df["type"] = str('CLL')

In [64]:
cll_cellC_df["bio"] = str('CLL')

In [65]:
cll_cellC_df["protocol"] = cll_cellC_df["filename"].str[5:17]

In [66]:
cll_cellC_df["filename"] = cll_cellC_df["filename"].str[:33]

In [67]:
cll_cellC_df.head()


Out[67]:
filename methylation PDR_total thisMeth mixedReadCount total_reads type bio protocol
0 RRBS_trito_pool_1_TAAGGCGA.ACAACC 0.582926 0.368337 2007240.0 1268327.0 3443386.0 CLL CLL trito_pool_1
1 RRBS_trito_pool_1_TAAGGCGA.ACGTGG 0.563719 0.378028 1147787.0 769701.0 2036096.0 CLL CLL trito_pool_1
2 RRBS_trito_pool_1_TAAGGCGA.ACTCAC 0.579104 0.372167 1774353.0 1140306.0 3063962.0 CLL CLL trito_pool_1
3 RRBS_trito_pool_1_TAAGGCGA.ATAGCG 0.544310 0.384963 1148407.0 812211.0 2109840.0 CLL CLL trito_pool_1
4 RRBS_trito_pool_1_TAAGGCGA.ATCGAC 0.572090 0.378931 1944098.0 1287699.0 3398241.0 CLL CLL trito_pool_1

In [ ]:


In [68]:
cpg5 = pd.read_csv('Meth_PDR_cell_RRBS_trito_pool_CpGs.csv')

In [69]:
cpg5 = cpg5.drop(["Unnamed: 0"], axis=1)

In [70]:
cpg5["filename"] = cpg5["filename"].str[:33]

In [71]:
cll_cellC_df = pd.merge(cll_cellC_df, cpg5, how='inner')

In [ ]:


In [72]:
frames2 = [cll_cellA_df, cll_cellC_df]

cll_result = pd.concat(frames2)

In [73]:
cll_result.shape


Out[73]:
(110, 11)

In [74]:
cll_result.head()


Out[74]:
PDR_total avgReadCpGs_mean avgReadCpGs_median bio filename methylation mixedReadCount protocol thisMeth total_reads type
0 0.391815 5.407442 4.0 CLL RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACAACC 0.572234 721478.0 cw154_CutSmart_proteinase_K 1053697.0 1841373.0 CLL
1 0.401942 5.341039 4.0 CLL RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACCGCG 0.511848 130878.0 cw154_CutSmart_proteinase_K 166665.0 325614.0 CLL
2 0.417094 5.367599 4.0 CLL RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACGTGG 0.531822 457775.0 cw154_CutSmart_proteinase_K 583693.0 1097534.0 CLL
3 0.391057 5.340834 4.0 CLL RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACTCAC 0.579616 698909.0 cw154_CutSmart_proteinase_K 1035906.0 1787229.0 CLL
4 0.389722 5.459580 5.0 CLL RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.AGGATG 0.571888 822828.0 cw154_CutSmart_proteinase_K 1207438.0 2111320.0 CLL

In [75]:
cll_result.columns


Out[75]:
Index(['PDR_total', 'avgReadCpGs_mean', 'avgReadCpGs_median', 'bio',
       'filename', 'methylation', 'mixedReadCount', 'protocol', 'thisMeth',
       'total_reads', 'type'],
      dtype='object')

In [76]:
cll_result = cll_result[['filename', 'PDR_total', 'total_reads', 'type', 'bio', 'protocol', 'avgReadCpGs_mean', 'avgReadCpGs_median']]

In [77]:
cll_result = cll_result.reset_index(drop=True)

In [78]:
normal_result = normal_result.reset_index(drop=True)

In [79]:
combined2 = normal_result.append(cll_result)

In [80]:
combined2 = combined2.reset_index(drop=True)

In [81]:
combined2.head()


Out[81]:
filename PDR_total total_reads type bio protocol avgReadCpGs_mean avgReadCpGs_median
0 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACAACC 0.271471 1954240.0 normal normal_B normal_B_cell_A1_24 5.295301 4.0
1 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACCGCG 0.435593 607531.0 normal normal_B normal_B_cell_A1_24 5.285714 4.0
2 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACGTGG 0.280063 1746403.0 normal normal_B normal_B_cell_A1_24 5.453122 5.0
3 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACTCAC 0.423883 2194.0 normal normal_B normal_B_cell_A1_24 4.950166 4.0
4 RRBS_normal_B_cell_A1_24_TAAGGCGA.AGGATG 0.253306 3584255.0 normal normal_B normal_B_cell_A1_24 5.366276 5.0

In [ ]:


In [82]:
# Variables
#
# totCpG ---- Number of unique CpGs per cell
# - Median Average Read CpG per cell (or mean if normally distributed)
# bsRate ---- BS rate per cell
#  - CLL or Normal status per cell
# What is the coefficient and the P value for each variable for a model predicting PDR?

In [ ]:


In [83]:
bs = pd.read_table('allStats.txt')

bs = bs.drop('sample.1', axis=1)
bs = bs.drop('sample.2', axis=1)

In [84]:
bs = bs.reset_index(drop=True)

In [85]:
bs = bs.drop('class', axis=1)
bs = bs.drop('totMeth', axis=1)
bs = bs.drop('totSeen', axis=1)
bs = bs.drop('avSum', axis=1)
bs = bs.drop('avTot', axis=1)
bs = bs.drop('rMixed', axis=1)
bs = bs.drop('rTot', axis=1)
bs = bs.drop('rAv', axis=1)
bs = bs.drop('rAvTot', axis=1)
bs = bs.drop('bed', axis=1)
bs = bs.drop('methInfoFile', axis=1)
bs = bs.drop('totReads', axis=1)
bs = bs.drop('totAligned', axis=1)
bs = bs.drop('totClipped', axis=1)
bs = bs.drop('totSeenCpG', axis=1)

bs = bs.drop('totUsed', axis=1)
bs = bs.drop('totMethCpG', axis=1)
# bs = bs.drop('totCpG', axis=1)
bs = bs.drop('totalReadPairs', axis=1)
bs = bs.drop('alignedReads', axis=1)
bs = bs.drop('totalReads', axis=1)

In [86]:
bs.head()


Out[86]:
sample totCpG bsRate
0 RRBS_trito_pool_1_TAAGGCGA.ACAACC 995714 0.980280
1 RRBS_trito_pool_1_TAAGGCGA.ACGTGG 705787 0.980081
2 RRBS_trito_pool_1_TAAGGCGA.ACTCAC 865744 0.980305
3 RRBS_trito_pool_1_TAAGGCGA.AGGATG 955160 0.980392
4 RRBS_trito_pool_1_TAAGGCGA.ATAGCG 634455 0.980256

In [87]:
bs = bs.rename(columns = {'sample':'filename'})

In [ ]:


In [88]:
merged = pd.merge(combined2, bs, how='inner')

In [89]:
merged = merged.reset_index(drop=True)

In [90]:
merged = merged.rename(columns = {'PDR_total':'PDR'})

In [91]:
merged = merged.rename(columns = {'totCpG':'Unique_CpGs'})

In [92]:
merged.head()


Out[92]:
filename PDR total_reads type bio protocol avgReadCpGs_mean avgReadCpGs_median Unique_CpGs bsRate
0 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACAACC 0.271471 1954240.0 normal normal_B normal_B_cell_A1_24 5.295301 4.0 178825 0.959657
1 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACCGCG 0.435593 607531.0 normal normal_B normal_B_cell_A1_24 5.285714 4.0 86434 0.958634
2 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACGTGG 0.280063 1746403.0 normal normal_B normal_B_cell_A1_24 5.453122 5.0 186115 0.958881
3 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACTCAC 0.423883 2194.0 normal normal_B normal_B_cell_A1_24 4.950166 4.0 6950 0.958175
4 RRBS_normal_B_cell_A1_24_TAAGGCGA.AGGATG 0.253306 3584255.0 normal normal_B normal_B_cell_A1_24 5.366276 5.0 289150 0.958404

In [93]:
# Remove all data points with less than 100k in totcpg 

merged = merged[merged['total_reads'] > 100000]

In [94]:
scattermatrix1 = merged.drop(['filename', 'total_reads', 'type', 'protocol', 'avgReadCpGs_median'], axis=1)

In [95]:
scattermatrix1.head()


Out[95]:
PDR bio avgReadCpGs_mean Unique_CpGs bsRate
0 0.271471 normal_B 5.295301 178825 0.959657
1 0.435593 normal_B 5.285714 86434 0.958634
2 0.280063 normal_B 5.453122 186115 0.958881
4 0.253306 normal_B 5.366276 289150 0.958404
5 0.443717 normal_B 5.269581 169511 0.959474

In [96]:
#sns.lmplot(x="bsRate", y="PDR", row="bio", col="Unique_CpGs", data=scattermatrix1)

In [97]:
sns.lmplot(x="bsRate", y="PDR",  data=scattermatrix1)


Out[97]:
<seaborn.axisgrid.FacetGrid at 0x1078a0cc0>

In [ ]:


In [ ]:


In [ ]:


In [98]:
sns.pairplot(scattermatrix1, hue='bio')
plt.title('Scatterplot, bio')


Out[98]:
<matplotlib.text.Text at 0x10801c240>

In [99]:
scattermatrix2 = merged.drop(['filename', 'type', 'protocol', 'avgReadCpGs_median'], axis=1)

In [100]:
scattermatrix2.head()


Out[100]:
PDR total_reads bio avgReadCpGs_mean Unique_CpGs bsRate
0 0.271471 1954240.0 normal_B 5.295301 178825 0.959657
1 0.435593 607531.0 normal_B 5.285714 86434 0.958634
2 0.280063 1746403.0 normal_B 5.453122 186115 0.958881
4 0.253306 3584255.0 normal_B 5.366276 289150 0.958404
5 0.443717 1863914.0 normal_B 5.269581 169511 0.959474

In [101]:
sns.pairplot(scattermatrix2, hue='bio')


Out[101]:
<seaborn.axisgrid.PairGrid at 0x103858ba8>

In [ ]:


In [ ]:


In [228]:
merged.head()


Out[228]:
filename PDR total_reads type bio protocol avgReadCpGs_mean avgReadCpGs_median Unique_CpGs bsRate
0 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACAACC 0.271471 1954240.0 normal normal_B normal_B_cell_A1_24 5.295301 4.0 178825 0.959657
1 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACCGCG 0.435593 607531.0 normal normal_B normal_B_cell_A1_24 5.285714 4.0 86434 0.958634
2 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACGTGG 0.280063 1746403.0 normal normal_B normal_B_cell_A1_24 5.453122 5.0 186115 0.958881
4 RRBS_normal_B_cell_A1_24_TAAGGCGA.AGGATG 0.253306 3584255.0 normal normal_B normal_B_cell_A1_24 5.366276 5.0 289150 0.958404
5 RRBS_normal_B_cell_A1_24_TAAGGCGA.ATAGCG 0.443717 1863914.0 normal normal_B normal_B_cell_A1_24 5.269581 4.0 169511 0.959474

In [229]:
scattermatrix3 = merged.drop(['filename', 'total_reads', 'bio', 'protocol', 'avgReadCpGs_median'], axis=1)

In [230]:
sns.pairplot(scattermatrix3, hue='type', palette='Set2')


Out[230]:
<seaborn.axisgrid.PairGrid at 0x11477c668>

In [231]:
scattermatrix4 = merged.drop(['filename', 'bio', 'protocol', 'avgReadCpGs_median'], axis=1)

In [232]:
sns.pairplot(scattermatrix4, hue='type', palette='Set1')


Out[232]:
<seaborn.axisgrid.PairGrid at 0x115778da0>

In [233]:
scattermatrix5 = merged.drop(['filename','total_reads', 'bio', 'type', 'avgReadCpGs_median'], axis=1)

In [234]:
sns.pairplot(scattermatrix5, hue='protocol')


Out[234]:
<seaborn.axisgrid.PairGrid at 0x1147918d0>

In [235]:
scattermatrix6 = merged.drop(['filename', 'bio', 'type', 'avgReadCpGs_median'], axis=1)

In [236]:
sns.pairplot(scattermatrix6, hue='protocol')


Out[236]:
<seaborn.axisgrid.PairGrid at 0x11633c1d0>

In [237]:
scattermatrix7 = merged.drop(['filename', 'bio', 'avgReadCpGs_median'], axis=1)

In [238]:
scattermatrix7.columns


Out[238]:
Index(['PDR', 'total_reads', 'type', 'protocol', 'avgReadCpGs_mean',
       'Unique_CpGs', 'bsRate'],
      dtype='object')

In [239]:
y = scattermatrix7.PDR # dependent variable

In [ ]:


In [240]:
y.shape


Out[240]:
(357,)

In [241]:
X = scattermatrix7.drop(['PDR', 'total_reads', 'protocol'], axis=1)

In [242]:
X.shape


Out[242]:
(357, 4)

In [243]:
X.head()


Out[243]:
type avgReadCpGs_mean Unique_CpGs bsRate
0 normal 5.295301 178825 0.959657
1 normal 5.285714 86434 0.958634
2 normal 5.453122 186115 0.958881
4 normal 5.366276 289150 0.958404
5 normal 5.269581 169511 0.959474

In [244]:
categorical_variables = ['type']

for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)

In [245]:
X.head()


Out[245]:
avgReadCpGs_mean Unique_CpGs bsRate type_CLL type_normal
0 5.295301 178825 0.959657 0.0 1.0
1 5.285714 86434 0.958634 0.0 1.0
2 5.453122 186115 0.958881 0.0 1.0
4 5.366276 289150 0.958404 0.0 1.0
5 5.269581 169511 0.959474 0.0 1.0

In [246]:
# only use type_CLL
X_CLL = X.drop("type_normal", axis=1)

X_CLL.head()


Out[246]:
avgReadCpGs_mean Unique_CpGs bsRate type_CLL
0 5.295301 178825 0.959657 0.0
1 5.285714 86434 0.958634 0.0
2 5.453122 186115 0.958881 0.0
4 5.366276 289150 0.958404 0.0
5 5.269581 169511 0.959474 0.0

In [247]:
from sklearn import linear_model
clf = linear_model.LinearRegression()

clf.fit(X,y)


Out[247]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [248]:
X.columns


Out[248]:
Index(['avgReadCpGs_mean', 'Unique_CpGs', 'bsRate', 'type_CLL', 'type_normal'], dtype='object')

In [249]:
print(clf.coef_)


[ -1.16707912e-01   7.76340513e-09  -3.17294437e+00   3.63312411e-02
  -3.63312411e-02]

In [250]:
clf.score(X,y) # Returns the coefficient of determination R^2 of the prediction.


Out[250]:
0.60925536710711237

In [251]:
import sklearn
sklearn.feature_selection.f_regression(X, y)
print("p-values are: ")
print(sklearn.feature_selection.f_regression(X, y)[1])


p-values are: 
[  1.09875016e-01   7.81862013e-08   4.23184771e-52   8.25952531e-30
   8.25952531e-30]

In [252]:
clf = linear_model.LinearRegression()

clf.fit(X_CLL,y)  # it's either CLL 0 or CLL 1


Out[252]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [253]:
X_CLL.columns


Out[253]:
Index(['avgReadCpGs_mean', 'Unique_CpGs', 'bsRate', 'type_CLL'], dtype='object')

In [254]:
print(clf.coef_)


[ -1.16707912e-01   7.76340513e-09  -3.17294437e+00   7.26624822e-02]

In [255]:
clf.score(X_CLL,y)


Out[255]:
0.60925536710711237

In [256]:
print("p-values are: ")
print(sklearn.feature_selection.f_regression(X_CLL, y)[1])


p-values are: 
[  1.09875016e-01   7.81862013e-08   4.23184771e-52   8.25952531e-30]

In [257]:
# Ridge

clf = linear_model.BayesianRidge()
clf.fit(X,y)


Out[257]:
BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
       normalize=False, tol=0.001, verbose=False)

In [126]:
print(clf.coef_)


[ -1.16514657e-01   8.17383759e-09  -3.12193603e+00   3.66234875e-02
  -3.66234875e-02]

In [127]:
clf.score(X,y)


Out[127]:
0.60918342000874781

In [128]:
import sklearn
sklearn.feature_selection.f_regression(X, y)
print(sklearn.feature_selection.f_regression(X, y)[1])


[  1.09875016e-01   7.81862013e-08   4.23184771e-52   8.25952531e-30
   8.25952531e-30]

In [129]:
# Lasso
clf = linear_model.Lasso()
clf.fit(X,y)


Out[129]:
Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [130]:
print(clf.coef_)   # why zero coefficients?


[ -0.00000000e+00   1.00643698e-07  -0.00000000e+00   0.00000000e+00
  -0.00000000e+00]

In [131]:
clf.score(X,y)     # Why is this so terrible?


Out[131]:
0.078159204902939372

In [132]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)


Out[132]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=1000, n_jobs=1, oob_score=True, random_state=36,
           verbose=0, warm_start=False)

In [133]:
X.columns


Out[133]:
Index(['avgReadCpGs_mean', 'Unique_CpGs', 'bsRate', 'type_CLL', 'type_normal'], dtype='object')

In [134]:
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance: avgReadCpGs_mean, Unique_CpGs, bsRate, type")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:4: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
Out[134]:
<matplotlib.text.Text at 0x112548358>

In [135]:
model.score(X,y)


Out[135]:
0.94122141182319152

In [136]:
scattermatrix7.columns


Out[136]:
Index(['PDR', 'total_reads', 'type', 'protocol', 'avgReadCpGs_mean',
       'Unique_CpGs', 'bsRate'],
      dtype='object')

In [137]:
sns.lmplot(x="bsRate", y="PDR",  data=scattermatrix7, hue='type')
plt.title("PDR vs. bsRate, by type")


Out[137]:
<matplotlib.text.Text at 0x112598320>

In [138]:
sns.lmplot(x="bsRate", y="PDR",  data=scattermatrix7, hue='protocol')
plt.title("PDR vs. bsRate, by protocol")


Out[138]:
<matplotlib.text.Text at 0x112615198>

In [ ]:


In [139]:
sns.lmplot(x="avgReadCpGs_mean", y="PDR",  data=scattermatrix7, hue='type')
plt.title("PDR vs. bsRate, by type")


Out[139]:
<matplotlib.text.Text at 0x11271d780>

In [140]:
sns.lmplot(x="avgReadCpGs_mean", y="PDR",  data=scattermatrix7, hue='protocol')
plt.title("PDR vs. bsRate, by protocol")


Out[140]:
<matplotlib.text.Text at 0x11282a5f8>

In [ ]:


In [ ]:


In [ ]:


In [141]:
sns.jointplot(x="bsRate", y="PDR",  data=scattermatrix7, kind="reg")
plt.title("PDR vs. bsRate, jointplot")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[141]:
<matplotlib.text.Text at 0x112953438>

In [142]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR",  data=scattermatrix7, kind="reg")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[142]:
<seaborn.axisgrid.JointGrid at 0x112a7f630>

In [143]:
sns.jointplot(x="Unique_CpGs", y="PDR",  data=scattermatrix7, kind="reg")
plt.title("Unique CpGs vs PDR")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[143]:
<matplotlib.text.Text at 0x112db9b70>

In [ ]:


In [144]:
type_cll = scattermatrix7[scattermatrix7.type == 'CLL']
type_normal = scattermatrix7[scattermatrix7.type == 'normal']

In [145]:
sns.jointplot(x="bsRate", y="PDR",  data=type_cll, kind="reg", color='r')
plt.title("PDR vs. bsRate, CLL samples, jointplot")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[145]:
<matplotlib.text.Text at 0x112fec550>

In [146]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR",  data=type_cll, kind="reg", color='r')


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[146]:
<seaborn.axisgrid.JointGrid at 0x112cc4d30>

In [147]:
sns.jointplot(x="Unique_CpGs", y="PDR",  data=type_cll, kind="reg", color='r')
plt.title("Unique CpGs vs PDR")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[147]:
<matplotlib.text.Text at 0x11343d4e0>

In [ ]:


In [148]:
sns.jointplot(x="bsRate", y="PDR",  data=type_normal, kind="reg", color='g')
plt.title("PDR vs. bsRate, normal samples, jointplot")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[148]:
<matplotlib.text.Text at 0x113651208>

In [149]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR",  data=type_normal, kind="reg", color='g')


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[149]:
<seaborn.axisgrid.JointGrid at 0x113502c18>

In [150]:
sns.jointplot(x="Unique_CpGs", y="PDR",  data=type_normal, kind="reg", color='g')
plt.title("Unique CpGs vs PDR")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[150]:
<matplotlib.text.Text at 0x113c713c8>

In [ ]:


In [151]:
# Can you try a model which just two variables: bs conversion and cll_vs-normal?

In [152]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=scattermatrix7["type"], y=scattermatrix7["bsRate"], hue=combined2.bio, jitter=True)
#sns.plt.title("Biological sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
#plt.xlabel("Normal cells = 304 *.anno files; CLL cells = 112 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[152]:
<matplotlib.legend.Legend at 0x113b4f128>

In [153]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=scattermatrix7["type"], y=scattermatrix7["bsRate"], hue=combined2.type, jitter=True)
#sns.plt.title("Biological sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
#plt.xlabel("Normal cells = 304 *.anno files; CLL cells = 112 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[153]:
<matplotlib.legend.Legend at 0x113ecdb70>

In [154]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=scattermatrix7["type"], y=scattermatrix7["bsRate"], hue=combined2.protocol, jitter=True)
#sns.plt.title("Biological sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
#plt.xlabel("Normal cells = 304 *.anno files; CLL cells = 112 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[154]:
<matplotlib.legend.Legend at 0x11415f048>

In [155]:
y = scattermatrix7.PDR # dependent variable


X = scattermatrix7.drop(['PDR', 'total_reads', 'protocol', 'avgReadCpGs_mean', 'Unique_CpGs'], axis=1)

In [156]:
X.shape


Out[156]:
(357, 2)

In [157]:
X.head()


Out[157]:
type bsRate
0 normal 0.959657
1 normal 0.958634
2 normal 0.958881
4 normal 0.958404
5 normal 0.959474

In [158]:
categorical_variables = ['type']

for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)

In [159]:
from sklearn import linear_model
clf = linear_model.LinearRegression()

clf.fit(X,y)


Out[159]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [160]:
X.columns


Out[160]:
Index(['bsRate', 'type_CLL', 'type_normal'], dtype='object')

In [161]:
print(clf.coef_)


[-3.14825912  0.03571414 -0.03571414]

In [162]:
clf.score(X,y)


Out[162]:
0.58294814270324924

In [163]:
# Lasso
clf = linear_model.Lasso()
clf.fit(X,y)


Out[163]:
Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [164]:
print(clf.coef_)


[-0.  0. -0.]

In [165]:
clf.score(X,y)


Out[165]:
0.0

In [166]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)


Out[166]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=1000, n_jobs=1, oob_score=True, random_state=36,
           verbose=0, warm_start=False)

In [167]:
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance: two variables: bs conversion and cll_vs-normal")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:4: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
Out[167]:
<matplotlib.text.Text at 0x114394b70>

In [168]:
model.score(X,y)


Out[168]:
0.92777611231493695

In [169]:
import sklearn
sklearn.feature_selection.f_regression(X, y)
print(sklearn.feature_selection.f_regression(X, y)[1])
# Returns:	
# F : array, shape=(n_features,)
# F values of features.
# pval : array, shape=(n_features,)
# p-values of F-scores.


[  4.23184771e-52   8.25952531e-30   8.25952531e-30]

In [258]:
# multiple regression model describes the response as a weighted sum of the predictors:
# 'avgReadCpGs_mean', 'Unique_CpGs', 'bsRate', 'type_CLL'
#
# PDR = \beta_0 + \beta_1 \avgReadCpGs_mean + \beta_2 \Unique_CpGs + \beta_3 \bsRate, \beta_4 \type_CLL
#

In [259]:
import statsmodels.api as sm

In [263]:
y = scattermatrix7.PDR # dependent variable

X = scattermatrix7.drop(['PDR', 'total_reads', 'protocol'], axis=1)

In [264]:
categorical_variables = ['type']

for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)

In [265]:
X.columns


Out[265]:
Index(['avgReadCpGs_mean', 'Unique_CpGs', 'bsRate', 'type_CLL', 'type_normal'], dtype='object')

In [266]:
X = X.drop('type_CLL', axis=1)

In [267]:
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()

In [268]:
est.summary()


Out[268]:
OLS Regression Results
Dep. Variable: PDR R-squared: 0.609
Model: OLS Adj. R-squared: 0.605
Method: Least Squares F-statistic: 137.2
Date: Wed, 06 Jul 2016 Prob (F-statistic): 1.61e-70
Time: 10:53:46 Log-Likelihood: 505.03
No. Observations: 357 AIC: -1000.
Df Residuals: 352 BIC: -980.7
Df Model: 4
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 4.1014 0.237 17.305 0.000 3.635 4.568
avgReadCpGs_mean -0.1167 0.024 -4.868 0.000 -0.164 -0.070
Unique_CpGs 7.763e-09 1.32e-08 0.587 0.557 -1.82e-08 3.38e-08
bsRate -3.1729 0.200 -15.836 0.000 -3.567 -2.779
type_normal -0.0727 0.008 -9.302 0.000 -0.088 -0.057
Omnibus: 53.943 Durbin-Watson: 1.128
Prob(Omnibus): 0.000 Jarque-Bera (JB): 14.655
Skew: 0.150 Prob(JB): 0.000657
Kurtosis: 2.054 Cond. No. 5.41e+07

In [280]:
y = scattermatrix7.PDR # dependent variable


X = scattermatrix7.drop(['PDR', 'total_reads', 'protocol', 'avgReadCpGs_mean', 'Unique_CpGs'], axis=1)

In [281]:
categorical_variables = ['type']

for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)

In [282]:
X.columns


Out[282]:
Index(['bsRate', 'type_CLL', 'type_normal'], dtype='object')

In [283]:
X = X.drop('type_normal', axis=1)

In [428]:
X.columns


Out[428]:
Index(['const', 'bsRate', 'type_CLL'], dtype='object')

In [429]:
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()

In [430]:
est.summary()


Out[430]:
OLS Regression Results
Dep. Variable: PDR R-squared: 0.583
Model: OLS Adj. R-squared: 0.581
Method: Least Squares F-statistic: 247.4
Date: Wed, 06 Jul 2016 Prob (F-statistic): 5.94e-68
Time: 12:33:09 Log-Likelihood: 493.40
No. Observations: 357 AIC: -980.8
Df Residuals: 354 BIC: -969.2
Df Model: 2
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 3.3792 0.201 16.809 0.000 2.984 3.775
bsRate -3.1483 0.205 -15.380 0.000 -3.551 -2.746
type_CLL 0.0714 0.008 9.424 0.000 0.057 0.086
Omnibus: 57.549 Durbin-Watson: 1.112
Prob(Omnibus): 0.000 Jarque-Bera (JB): 16.257
Skew: 0.213 Prob(JB): 0.000295
Kurtosis: 2.045 Cond. No. 127.

In [ ]:


In [523]:
tritopool = merged[merged["protocol"] == 'trito_pool_1']

In [ ]:


In [524]:
tritopool = tritopool.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)

In [525]:
tritopool = tritopool.reset_index(drop=True)

In [526]:
tritopool.head()


Out[526]:
filename PDR total_reads avgReadCpGs_mean Unique_CpGs bsRate
0 RRBS_trito_pool_1_TAAGGCGA.ACAACC 0.368337 3443386.0 5.326650 995714 0.980280
1 RRBS_trito_pool_1_TAAGGCGA.ACGTGG 0.378028 2036096.0 5.596456 705787 0.980081
2 RRBS_trito_pool_1_TAAGGCGA.ACTCAC 0.372167 3063962.0 5.437243 865744 0.980305
3 RRBS_trito_pool_1_TAAGGCGA.ATAGCG 0.384963 2109840.0 5.414803 634455 0.980256
4 RRBS_trito_pool_1_TAAGGCGA.ATCGAC 0.378931 3398241.0 5.477420 1018988 0.980243

In [ ]:


In [527]:
tritopoolA = tritopool.set_index("filename")

In [528]:
from itertools import combinations
cc = list(combinations(tritopool.filename,2))

In [529]:
len(cc)


Out[529]:
210

In [530]:
out = pd.DataFrame([tritopoolA.loc[c,:].mean() for c in cc], index=cc)

In [531]:
out.head()


Out[531]:
PDR total_reads avgReadCpGs_mean Unique_CpGs bsRate
(RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito_pool_1_TAAGGCGA.ACGTGG) 0.373183 2739741.0 5.461553 850750.5 0.980181
(RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito_pool_1_TAAGGCGA.ACTCAC) 0.370252 3253674.0 5.381947 930729.0 0.980293
(RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito_pool_1_TAAGGCGA.ATAGCG) 0.376650 2776613.0 5.370726 815084.5 0.980268
(RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito_pool_1_TAAGGCGA.ATCGAC) 0.373634 3420813.5 5.402035 1007351.0 0.980262
(RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito_pool_1_TAAGGCGA.CAAGAG) 0.374483 3369791.5 5.356506 995566.0 0.980228

In [532]:
tritopoolA


Out[532]:
PDR total_reads avgReadCpGs_mean Unique_CpGs bsRate
filename
RRBS_trito_pool_1_TAAGGCGA.ACAACC 0.368337 3443386.0 5.326650 995714 0.980280
RRBS_trito_pool_1_TAAGGCGA.ACGTGG 0.378028 2036096.0 5.596456 705787 0.980081
RRBS_trito_pool_1_TAAGGCGA.ACTCAC 0.372167 3063962.0 5.437243 865744 0.980305
RRBS_trito_pool_1_TAAGGCGA.ATAGCG 0.384963 2109840.0 5.414803 634455 0.980256
RRBS_trito_pool_1_TAAGGCGA.ATCGAC 0.378931 3398241.0 5.477420 1018988 0.980243
RRBS_trito_pool_1_TAAGGCGA.CAAGAG 0.380629 3296197.0 5.386361 995418 0.980176
RRBS_trito_pool_1_TAAGGCGA.CATGAC 0.371924 3363701.0 5.507042 1027388 0.980220
RRBS_trito_pool_1_TAAGGCGA.CCTTCG 0.377104 1976743.0 5.288422 681439 0.980113
RRBS_trito_pool_1_TAAGGCGA.CGGTAG 0.387322 2428467.0 5.324041 783696 0.980278
RRBS_trito_pool_1_TAAGGCGA.CTATTG 0.375160 4140444.0 5.421629 1091089 0.980508
RRBS_trito_pool_1_TAAGGCGA.GACACG 0.375325 2015619.0 5.430819 753481 0.980255
RRBS_trito_pool_1_TAAGGCGA.GCATTC 0.375331 2985106.0 5.374387 937896 0.980234
RRBS_trito_pool_1_TAAGGCGA.GCTGCC 0.393086 965553.0 5.293828 524241 0.979914
RRBS_trito_pool_1_TAAGGCGA.GGCATC 0.384677 2212326.0 5.404690 835577 0.980118
RRBS_trito_pool_1_TAAGGCGA.GTGAGG 0.380027 2357929.0 5.434248 832135 0.980150
RRBS_trito_pool_1_TAAGGCGA.GTTGAG 0.378241 2902240.0 5.460279 919081 0.980045
RRBS_trito_pool_1_TAAGGCGA.TAGCGG 0.385324 1957341.0 5.578051 750879 0.980104
RRBS_trito_pool_1_TAAGGCGA.TATCTC 0.370555 3699347.0 5.385500 1033019 0.980429
RRBS_trito_pool_1_TAAGGCGA.TCTCTG 0.372230 3195035.0 5.507866 983183 0.980216
RRBS_trito_pool_1_TAAGGCGA.TGACAG 0.378355 2915362.0 5.445355 911433 0.980182
RRBS_trito_pool_1_TAAGGCGA.TGCTGC 0.386571 1754314.0 5.436130 758364 0.979957

In [533]:
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.PDR, tritopool.PDR)), tritopool.filename, tritopool.filename)

In [534]:
df_ex.head()


Out[534]:
filename RRBS_trito_pool_1_TAAGGCGA.ACAACC RRBS_trito_pool_1_TAAGGCGA.ACGTGG RRBS_trito_pool_1_TAAGGCGA.ACTCAC RRBS_trito_pool_1_TAAGGCGA.ATAGCG RRBS_trito_pool_1_TAAGGCGA.ATCGAC RRBS_trito_pool_1_TAAGGCGA.CAAGAG RRBS_trito_pool_1_TAAGGCGA.CATGAC RRBS_trito_pool_1_TAAGGCGA.CCTTCG RRBS_trito_pool_1_TAAGGCGA.CGGTAG RRBS_trito_pool_1_TAAGGCGA.CTATTG RRBS_trito_pool_1_TAAGGCGA.GACACG RRBS_trito_pool_1_TAAGGCGA.GCATTC RRBS_trito_pool_1_TAAGGCGA.GCTGCC RRBS_trito_pool_1_TAAGGCGA.GGCATC RRBS_trito_pool_1_TAAGGCGA.GTGAGG RRBS_trito_pool_1_TAAGGCGA.GTTGAG RRBS_trito_pool_1_TAAGGCGA.TAGCGG RRBS_trito_pool_1_TAAGGCGA.TATCTC RRBS_trito_pool_1_TAAGGCGA.TCTCTG RRBS_trito_pool_1_TAAGGCGA.TGACAG RRBS_trito_pool_1_TAAGGCGA.TGCTGC
filename
RRBS_trito_pool_1_TAAGGCGA.ACAACC 0.000000 0.009691 0.003830 0.016626 0.010594 0.012291 0.003587 0.008767 0.018985 0.006822 0.006988 0.006994 0.024748 0.016339 0.011690 0.009904 0.016987 0.002218 0.003893 0.010018 0.018234
RRBS_trito_pool_1_TAAGGCGA.ACGTGG 0.009691 0.000000 0.005861 0.006935 0.000903 0.002601 0.006104 0.000924 0.009294 0.002868 0.002703 0.002697 0.015058 0.006649 0.001999 0.000213 0.007296 0.007473 0.005798 0.000327 0.008543
RRBS_trito_pool_1_TAAGGCGA.ACTCAC 0.003830 0.005861 0.000000 0.012796 0.006764 0.008461 0.000243 0.004937 0.015155 0.002992 0.003158 0.003164 0.020918 0.012509 0.007860 0.006074 0.013157 0.001612 0.000063 0.006188 0.014404
RRBS_trito_pool_1_TAAGGCGA.ATAGCG 0.016626 0.006935 0.012796 0.000000 0.006032 0.004335 0.013040 0.007859 0.002359 0.009804 0.009638 0.009632 0.008122 0.000287 0.004937 0.006722 0.000360 0.014408 0.012733 0.006608 0.001608
RRBS_trito_pool_1_TAAGGCGA.ATCGAC 0.010594 0.000903 0.006764 0.006032 0.000000 0.001698 0.007007 0.001827 0.008391 0.003772 0.003606 0.003600 0.014155 0.005746 0.001096 0.000690 0.006393 0.008376 0.006701 0.000576 0.007640

In [535]:
row = df_ex.iloc[0]

In [536]:
row.values


Out[536]:
array([ 0.        ,  0.0096907 ,  0.00382999,  0.01662616,  0.01059389,
        0.01229149,  0.00358651,  0.008767  ,  0.01898497,  0.00682237,
        0.00698775,  0.00699391,  0.02474847,  0.01633943,  0.01168956,
        0.00990414,  0.01698661,  0.00221797,  0.0038932 ,  0.01001792,
        0.01823389])

In [537]:
pdr = tritopool[['filename', 'PDR']]

In [538]:
pdr = pdr.set_index("filename")

In [539]:
pdr


Out[539]:
PDR
filename
RRBS_trito_pool_1_TAAGGCGA.ACAACC 0.368337
RRBS_trito_pool_1_TAAGGCGA.ACGTGG 0.378028
RRBS_trito_pool_1_TAAGGCGA.ACTCAC 0.372167
RRBS_trito_pool_1_TAAGGCGA.ATAGCG 0.384963
RRBS_trito_pool_1_TAAGGCGA.ATCGAC 0.378931
RRBS_trito_pool_1_TAAGGCGA.CAAGAG 0.380629
RRBS_trito_pool_1_TAAGGCGA.CATGAC 0.371924
RRBS_trito_pool_1_TAAGGCGA.CCTTCG 0.377104
RRBS_trito_pool_1_TAAGGCGA.CGGTAG 0.387322
RRBS_trito_pool_1_TAAGGCGA.CTATTG 0.375160
RRBS_trito_pool_1_TAAGGCGA.GACACG 0.375325
RRBS_trito_pool_1_TAAGGCGA.GCATTC 0.375331
RRBS_trito_pool_1_TAAGGCGA.GCTGCC 0.393086
RRBS_trito_pool_1_TAAGGCGA.GGCATC 0.384677
RRBS_trito_pool_1_TAAGGCGA.GTGAGG 0.380027
RRBS_trito_pool_1_TAAGGCGA.GTTGAG 0.378241
RRBS_trito_pool_1_TAAGGCGA.TAGCGG 0.385324
RRBS_trito_pool_1_TAAGGCGA.TATCTC 0.370555
RRBS_trito_pool_1_TAAGGCGA.TCTCTG 0.372230
RRBS_trito_pool_1_TAAGGCGA.TGACAG 0.378355
RRBS_trito_pool_1_TAAGGCGA.TGCTGC 0.386571

In [540]:
stacked = df_ex.stack()

In [541]:
#stacked.index

In [542]:
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

In [543]:
pdr_differences.head()


Out[543]:
filename PDR_difference
0 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito... 0.000000
1 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito... 0.009691
2 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito... 0.003830
3 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito... 0.016626
4 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito... 0.010594

In [544]:
out['filename'] = out.index

In [545]:
out = out.reset_index(drop=True)

In [546]:
out.head()


Out[546]:
PDR total_reads avgReadCpGs_mean Unique_CpGs bsRate filename
0 0.373183 2739741.0 5.461553 850750.5 0.980181 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito...
1 0.370252 3253674.0 5.381947 930729.0 0.980293 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito...
2 0.376650 2776613.0 5.370726 815084.5 0.980268 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito...
3 0.373634 3420813.5 5.402035 1007351.0 0.980262 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito...
4 0.374483 3369791.5 5.356506 995566.0 0.980228 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito...

In [547]:
pairs = pd.merge(out, pdr_differences, how='inner')

In [548]:
pairs.head()


Out[548]:
PDR total_reads avgReadCpGs_mean Unique_CpGs bsRate filename PDR_difference
0 0.373183 2739741.0 5.461553 850750.5 0.980181 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito... 0.009691
1 0.370252 3253674.0 5.381947 930729.0 0.980293 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito... 0.003830
2 0.376650 2776613.0 5.370726 815084.5 0.980268 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito... 0.016626
3 0.373634 3420813.5 5.402035 1007351.0 0.980262 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito... 0.010594
4 0.374483 3369791.5 5.356506 995566.0 0.980228 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito... 0.012291

In [549]:
pairs.shape


Out[549]:
(210, 7)

In [550]:
"""
2. We need a regression model for predicting PDR distance between any two cells (highest priority): 
Calculate the absolute difference in PDR between any two cells. Choose only pairs of cells that are in the same batch to control for batch effects. 
For each pair, generate an average of the two cells for: BS rate, number of unique CpG, Median Average Read CpG per cell. 

Variables of the regression model are: 
- Number of unique CpGs per pair
- Median Average Read CpG per pair (or mean if normally distributed)
- BS rate per pair
- CLL or Normal per pair
What is the coefficient and the  P value for each variable for a model predicting PDR distance between two cells?


"""


Out[550]:
'\n2. We need a regression model for predicting PDR distance between any two cells (highest priority): \nCalculate the absolute difference in PDR between any two cells. Choose only pairs of cells that are in the same batch to control for batch effects. \nFor each pair, generate an average of the two cells for: BS rate, number of unique CpG, Median Average Read CpG per cell. \n\nVariables of the regression model are: \n- Number of unique CpGs per pair\n- Median Average Read CpG per pair (or mean if normally distributed)\n- BS rate per pair\n- CLL or Normal per pair\nWhat is the coefficient and the  P value for each variable for a model predicting PDR distance between two cells?\n\n\n'

In [551]:
pairs = pairs.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})

In [552]:
pairs.head()


Out[552]:
PDR total_reads_mean avgReadCpGs_mean Unique_CpGs_mean bsRate_mean filename PDR_difference
0 0.373183 2739741.0 5.461553 850750.5 0.980181 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito... 0.009691
1 0.370252 3253674.0 5.381947 930729.0 0.980293 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito... 0.003830
2 0.376650 2776613.0 5.370726 815084.5 0.980268 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito... 0.016626
3 0.373634 3420813.5 5.402035 1007351.0 0.980262 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito... 0.010594
4 0.374483 3369791.5 5.356506 995566.0 0.980228 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito... 0.012291

In [553]:
pairs.columns


Out[553]:
Index(['PDR', 'total_reads_mean', 'avgReadCpGs_mean', 'Unique_CpGs_mean',
       'bsRate_mean', 'filename', 'PDR_difference'],
      dtype='object')

In [554]:
y = pairs.PDR_difference # dependent variable


X = pairs.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)

In [555]:
y.shape


Out[555]:
(210,)

In [556]:
X.shape


Out[556]:
(210, 3)

In [557]:
X.head()


Out[557]:
avgReadCpGs_mean Unique_CpGs_mean bsRate_mean
0 5.461553 850750.5 0.980181
1 5.381947 930729.0 0.980293
2 5.370726 815084.5 0.980268
3 5.402035 1007351.0 0.980262
4 5.356506 995566.0 0.980228

In [558]:
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()

In [559]:
est.summary()


Out[559]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.120
Model: OLS Adj. R-squared: 0.107
Method: Least Squares F-statistic: 9.359
Date: Wed, 06 Jul 2016 Prob (F-statistic): 7.92e-06
Time: 13:15:01 Log-Likelihood: 819.55
No. Observations: 210 AIC: -1631.
Df Residuals: 206 BIC: -1618.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 4.5340 4.732 0.958 0.339 -4.795 13.863
avgReadCpGs_mean -0.0253 0.006 -3.938 0.000 -0.038 -0.013
Unique_CpGs_mean -7.068e-09 4.46e-09 -1.584 0.115 -1.59e-08 1.73e-09
bsRate_mean -4.4719 4.822 -0.927 0.355 -13.979 5.035
Omnibus: 9.333 Durbin-Watson: 1.670
Prob(Omnibus): 0.009 Jarque-Bera (JB): 9.295
Skew: 0.475 Prob(JB): 0.00959
Kurtosis: 2.599 Cond. No. 1.72e+10

In [560]:
pairs.columns


Out[560]:
Index(['PDR', 'total_reads_mean', 'avgReadCpGs_mean', 'Unique_CpGs_mean',
       'bsRate_mean', 'filename', 'PDR_difference'],
      dtype='object')

In [585]:
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[585]:
<matplotlib.text.Text at 0x11d422c18>

In [587]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[587]:
<matplotlib.text.Text at 0x11d9a2ba8>

In [588]:
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[588]:
<matplotlib.text.Text at 0x11dcbc908>

In [564]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
Out[564]:
<matplotlib.text.Text at 0x11c2555c0>

In [565]:
model.score(X,y) # RF model score


Out[565]:
0.84237125919799527

In [567]:
# cw154_Tris_protease 
# NormalBCD19pCD27pcell23_44
# NormalBCD19mCD27pcell23_44
# normal_B_cell_C1_24

In [568]:
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease']

In [569]:
cw154 = cw154.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)

In [570]:
cw154 = cw154.reset_index(drop=True)

In [571]:
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)

In [572]:
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.PDR, cw154.PDR)), cw154.filename, cw154.filename)

In [573]:
stacked = df_ex.stack()

In [574]:
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

In [575]:
pdr_differences.head()


Out[575]:
filename PDR_difference
0 (RRBS_cw154_Tris_protease_CTCTCTAC.ACAACC, RRB... 0.000000
1 (RRBS_cw154_Tris_protease_CTCTCTAC.ACAACC, RRB... 0.021444
2 (RRBS_cw154_Tris_protease_CTCTCTAC.ACAACC, RRB... 0.025515
3 (RRBS_cw154_Tris_protease_CTCTCTAC.ACAACC, RRB... 0.000277
4 (RRBS_cw154_Tris_protease_CTCTCTAC.ACAACC, RRB... 0.012904

In [576]:
out['filename'] = out.index
out = out.reset_index(drop=True)

In [577]:
pairs2 = pd.merge(out, pdr_differences, how='inner')

In [578]:
pairs2 = pairs2.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})

In [579]:
pairs2.head()


Out[579]:
PDR total_reads_mean avgReadCpGs_mean Unique_CpGs_mean bsRate_mean filename PDR_difference
0 0.424997 1287881.0 5.522677 336578.0 0.961075 (RRBS_cw154_Tris_protease_CTCTCTAC.ACAACC, RRB... 0.021444
1 0.427033 1363167.0 5.542956 339940.0 0.961160 (RRBS_cw154_Tris_protease_CTCTCTAC.ACAACC, RRB... 0.025515
2 0.414414 2138379.5 5.346094 518622.0 0.961080 (RRBS_cw154_Tris_protease_CTCTCTAC.ACAACC, RRB... 0.000277
3 0.420727 1798187.5 5.599726 425069.0 0.961269 (RRBS_cw154_Tris_protease_CTCTCTAC.ACAACC, RRB... 0.012904
4 0.415530 1772492.0 5.529529 429952.0 0.961256 (RRBS_cw154_Tris_protease_CTCTCTAC.ACAACC, RRB... 0.002509

In [582]:
y = pairs2.PDR_difference # dependent variable

X = pairs2.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)

In [583]:
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
est.summary()


Out[583]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.100
Model: OLS Adj. R-squared: 0.087
Method: Least Squares F-statistic: 7.599
Date: Wed, 06 Jul 2016 Prob (F-statistic): 7.62e-05
Time: 13:16:34 Log-Likelihood: 687.17
No. Observations: 210 AIC: -1366.
Df Residuals: 206 BIC: -1353.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 5.4343 4.155 1.308 0.192 -2.757 13.626
avgReadCpGs_mean 0.0278 0.008 3.326 0.001 0.011 0.044
Unique_CpGs_mean -1.079e-08 5.93e-09 -1.821 0.070 -2.25e-08 8.95e-10
bsRate_mean -5.7956 4.349 -1.333 0.184 -14.369 2.778
Omnibus: 15.543 Durbin-Watson: 1.485
Prob(Omnibus): 0.000 Jarque-Bera (JB): 17.018
Skew: 0.690 Prob(JB): 0.000202
Kurtosis: 3.198 Cond. No. 3.92e+09

In [589]:
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs2, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, cw154_Tris_protease")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[589]:
<matplotlib.text.Text at 0x11dfe3358>

In [590]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs2, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154_Tris_protease")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[590]:
<matplotlib.text.Text at 0x11e23d438>

In [591]:
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs2, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, cw154_Tris_protease")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[591]:
<matplotlib.text.Text at 0x11e498c18>

In [595]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(model.score(X,y))


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
0.859038365117

In [596]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell23_44']

pcell = pcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)

pcell = pcell.reset_index(drop=True)

In [597]:
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)

In [598]:
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR, pcell.PDR)), pcell.filename, pcell.filename)

In [599]:
stacked = df_ex.stack()

In [600]:
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

In [601]:
out['filename'] = out.index
out = out.reset_index(drop=True)

In [602]:
pairs3 = pd.merge(out, pdr_differences, how='inner')

pairs3 = pairs3.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})

In [603]:
y = pairs3.PDR_difference # dependent variable

X = pairs3.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)

X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
est.summary()


Out[603]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.024
Model: OLS Adj. R-squared: 0.011
Method: Least Squares F-statistic: 1.885
Date: Wed, 06 Jul 2016 Prob (F-statistic): 0.133
Time: 13:32:30 Log-Likelihood: 504.82
No. Observations: 231 AIC: -1002.
Df Residuals: 227 BIC: -987.9
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -45.9069 45.544 -1.008 0.315 -135.651 43.837
avgReadCpGs_mean 0.0452 0.023 1.943 0.053 -0.001 0.091
Unique_CpGs_mean 1.315e-08 2.77e-08 0.475 0.635 -4.14e-08 6.77e-08
bsRate_mean 45.8071 45.629 1.004 0.316 -44.104 135.719
Omnibus: 26.931 Durbin-Watson: 1.515
Prob(Omnibus): 0.000 Jarque-Bera (JB): 32.939
Skew: 0.912 Prob(JB): 7.04e-08
Kurtosis: 3.308 Cond. No. 1.28e+10

In [604]:
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs3, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, CD27 pcell ")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[604]:
<matplotlib.text.Text at 0x11ecbe828>

In [605]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs3, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, CD27 pcell")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[605]:
<matplotlib.text.Text at 0x11f02bd68>

In [606]:
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs3, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, CD27 pcell")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[606]:
<matplotlib.text.Text at 0x11f31df98>

In [607]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(model.score(X,y))


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
0.832634013746

In [623]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell23_44']

mcell = mcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)

mcell = mcell.reset_index(drop=True)

In [624]:
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)

In [625]:
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR, mcell.PDR)), mcell.filename, mcell.filename)

In [626]:
stacked = df_ex.stack()

In [627]:
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

In [628]:
out['filename'] = out.index
out = out.reset_index(drop=True)

In [629]:
pairs4 = pd.merge(out, pdr_differences, how='inner')

pairs4 = pairs4.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})

In [632]:
y = pairs4.PDR_difference # dependent variable

X = pairs4.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)

X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
est.summary()


Out[632]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.082
Model: OLS Adj. R-squared: 0.066
Method: Least Squares F-statistic: 4.997
Date: Wed, 06 Jul 2016 Prob (F-statistic): 0.00241
Time: 13:38:43 Log-Likelihood: 438.62
No. Observations: 171 AIC: -869.2
Df Residuals: 167 BIC: -856.7
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 13.0832 55.287 0.237 0.813 -96.069 122.235
avgReadCpGs_mean -0.0481 0.040 -1.216 0.226 -0.126 0.030
Unique_CpGs_mean 5.067e-08 2.61e-08 1.942 0.054 -8.35e-10 1.02e-07
bsRate_mean -12.8568 55.427 -0.232 0.817 -122.286 96.572
Omnibus: 74.640 Durbin-Watson: 0.319
Prob(Omnibus): 0.000 Jarque-Bera (JB): 185.034
Skew: 1.960 Prob(JB): 6.61e-41
Kurtosis: 6.255 Cond. No. 1.91e+10

In [633]:
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs4, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, CD27 mcell ")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[633]:
<matplotlib.text.Text at 0x11f759e48>

In [634]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs4, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, CD27 mcell")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[634]:
<matplotlib.text.Text at 0x11f9f2a20>

In [635]:
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs4, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, CD27 mcell")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[635]:
<matplotlib.text.Text at 0x11fd3a400>

In [636]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(model.score(X,y))


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
0.807421007306

In [639]:
pairs['type'] = str('CLL')

In [643]:
pairs2['type'] = str('CLL')

In [647]:
pairs3['type'] = str('normal')

In [648]:
pairs4['type'] = str('normal')

In [649]:
frames22 = [pairs, pairs2, pairs3, pairs4]

total_pairs = pd.concat(frames22)

In [650]:
total_pairs.head()


Out[650]:
PDR total_reads_mean avgReadCpGs_mean Unique_CpGs_mean bsRate_mean filename PDR_difference type
0 0.373183 2739741.0 5.461553 850750.5 0.980181 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito... 0.009691 CLL
1 0.370252 3253674.0 5.381947 930729.0 0.980293 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito... 0.003830 CLL
2 0.376650 2776613.0 5.370726 815084.5 0.980268 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito... 0.016626 CLL
3 0.373634 3420813.5 5.402035 1007351.0 0.980262 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito... 0.010594 CLL
4 0.374483 3369791.5 5.356506 995566.0 0.980228 (RRBS_trito_pool_1_TAAGGCGA.ACAACC, RRBS_trito... 0.012291 CLL

In [651]:
total_pairs.columns


Out[651]:
Index(['PDR', 'total_reads_mean', 'avgReadCpGs_mean', 'Unique_CpGs_mean',
       'bsRate_mean', 'filename', 'PDR_difference', 'type'],
      dtype='object')

In [ ]:


In [658]:
y = total_pairs.PDR_difference # dependent variable

X = total_pairs.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)

In [659]:
categorical_variables = ['type']

for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)

In [660]:
X.columns


Out[660]:
Index(['avgReadCpGs_mean', 'Unique_CpGs_mean', 'bsRate_mean', 'type_CLL',
       'type_normal'],
      dtype='object')

In [661]:
X = X.drop(['type_normal'], axis=1)

In [662]:
X.columns


Out[662]:
Index(['avgReadCpGs_mean', 'Unique_CpGs_mean', 'bsRate_mean', 'type_CLL'], dtype='object')

In [663]:
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
est.summary()


Out[663]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.150
Model: OLS Adj. R-squared: 0.146
Method: Least Squares F-statistic: 36.09
Date: Wed, 06 Jul 2016 Prob (F-statistic): 8.43e-28
Time: 13:50:50 Log-Likelihood: 2076.0
No. Observations: 822 AIC: -4142.
Df Residuals: 817 BIC: -4118.
Df Model: 4
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 0.1874 0.192 0.977 0.329 -0.189 0.564
avgReadCpGs_mean -0.0066 0.009 -0.743 0.458 -0.024 0.011
Unique_CpGs_mean -7.19e-09 7.35e-09 -0.978 0.328 -2.16e-08 7.24e-09
bsRate_mean -0.1240 0.197 -0.628 0.530 -0.511 0.263
type_CLL -0.0167 0.007 -2.234 0.026 -0.031 -0.002
Omnibus: 279.133 Durbin-Watson: 1.091
Prob(Omnibus): 0.000 Jarque-Bera (JB): 861.832
Skew: 1.676 Prob(JB): 7.17e-188
Kurtosis: 6.733 Cond. No. 2.20e+08

In [664]:
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, both CLL and normal ")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[664]:
<matplotlib.text.Text at 0x11c6e0cf8>

In [670]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, both CLL and normal ")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[670]:
<matplotlib.text.Text at 0x121a640f0>

In [671]:
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, both CLL and normal ")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[671]:
<matplotlib.text.Text at 0x1215a42e8>

In [672]:
total_pairs.columns


Out[672]:
Index(['PDR', 'total_reads_mean', 'avgReadCpGs_mean', 'Unique_CpGs_mean',
       'bsRate_mean', 'filename', 'PDR_difference', 'type'],
      dtype='object')

In [673]:
sns.lmplot(x="bsRate_mean", y="PDR_difference",  data=total_pairs, hue='type')


Out[673]:
<seaborn.axisgrid.FacetGrid at 0x121cf7b38>

In [674]:
sns.lmplot(x="avgReadCpGs_mean", y="PDR_difference",  data=total_pairs, hue='type')


Out[674]:
<seaborn.axisgrid.FacetGrid at 0x1219b4780>

In [675]:
sns.lmplot(x="Unique_CpGs_mean", y="PDR_difference",  data=total_pairs, hue='type')


Out[675]:
<seaborn.axisgrid.FacetGrid at 0x121a900f0>

In [676]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(model.score(X,y))


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
0.87199641513

In [ ]:


In [ ]:


In [ ]:


In [ ]: