In [1]:
%matplotlib inline
In [2]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # show up to 50 columns
import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')
In [3]:
#
# CLL_cells_PDR_total_chr2_5_11A.csv
# CLL_cells_PDR_total_chr2_5_11B.csv
# CLL_cells_PDR_total_chr2_5_11_tritoC.csv
# Normal_cells_PDR_total_chr2_5_11A.csv
# Normal_cells_PDR_total_chr2_5_11B.csv
# Normal_cells_PDR_total_chr2_5_11C_mcells.csv
#
In [4]:
"""
RRBS_NormalBCD19pCD27pcell_CpGs.csv
RRBS_NormalBCD19pCD27mcell_CpGs.csv
Meth_PDR_cell_RRBS_normal_B1_CpGs.csv
Meth_PDR_cell_RRBS_trito_pool_CpGs.csv
CLL_RRBS_cw154_A_CpGs.csv
"""
Out[4]:
In [5]:
normal_cellA_df = pd.read_csv("Meth_PDR_cell_RRBS_normal_B1.csv")
normal_cellB_df = pd.read_csv("Meth_PDR_cell_normalpcell.csv")
normal_cellC_df = pd.read_csv("Meth_PDR_cell_normal_mcell.csv")
In [ ]:
In [6]:
normal_cellA_df = normal_cellA_df.drop(["Unnamed: 0"], axis=1)
In [7]:
normal_cellA_df["type"] = str('normal')
In [8]:
normal_cellA_df["bio"] = str('normal_B')
In [9]:
normal_cellA_df["protocol"] = normal_cellA_df["filename"].str[5:24]
In [10]:
normal_cellA_df["filename"] = normal_cellA_df["filename"].str[:40]
In [11]:
normal_cellA_df.head()
Out[11]:
In [12]:
cpg1 = pd.read_csv('Meth_PDR_cell_RRBS_normal_B1_CpGs.csv')
In [13]:
cpg1 = cpg1.drop(["Unnamed: 0"], axis=1)
In [14]:
cpg1["filename"] = cpg1["filename"].str[:40]
In [15]:
cpg1.head()
Out[15]:
In [16]:
normal_cellA_df = pd.merge(normal_cellA_df, cpg1, how='inner')
In [17]:
normal_cellB_df["type"] = str('normal')
In [18]:
normal_cellB_df = normal_cellB_df.drop(["Unnamed: 0"], axis=1)
In [19]:
normal_cellB_df.head()
Out[19]:
In [20]:
normal_cellB_df["bio"] = str('CD27p')
In [21]:
normal_cellB_df["protocol"] = normal_cellB_df["filename"].str[5:31]
In [22]:
normal_cellB_df["filename"] = normal_cellB_df["filename"].str[:50]
In [23]:
normal_cellB_df["filename"] = normal_cellB_df["filename"].str.replace(r'.dan$', '')
normal_cellB_df["filename"] = normal_cellB_df["filename"].str.replace(r'.da$', '')
In [24]:
normal_cellB_df.head()
Out[24]:
In [25]:
normal_cellB_df.tail()
Out[25]:
In [26]:
cpg2 = pd.read_csv('RRBS_NormalBCD19pCD27pcell_CpGs.csv')
In [27]:
cpg2 = cpg2.drop(["Unnamed: 0"], axis=1)
In [28]:
cpg2["filename"] = cpg2["filename"].str[:50]
In [29]:
cpg2["filename"] = cpg2["filename"].str.replace(r'.dan$', '')
cpg2["filename"] = cpg2["filename"].str.replace(r'.da$', '')
In [30]:
normal_cellB_df = pd.merge(normal_cellB_df, cpg2, how='inner')
In [31]:
normal_cellC_df = normal_cellC_df.drop(["Unnamed: 0"], axis=1)
In [32]:
normal_cellC_df["type"] = str('normal')
In [33]:
normal_cellC_df["protocol"] = normal_cellC_df["filename"].str[5:31]
In [34]:
normal_cellC_df["bio"] = str('CD27m')
In [35]:
normal_cellC_df["filename"] = normal_cellC_df["filename"].str[:50]
In [36]:
normal_cellC_df["filename"] = normal_cellC_df["filename"].str.replace(r'.dan$', '')
normal_cellC_df["filename"] = normal_cellC_df["filename"].str.replace(r'.da$', '')
In [ ]:
In [37]:
cpg3 = pd.read_csv("RRBS_NormalBCD19pCD27mcell_CpGs.csv")
In [38]:
cpg3 = cpg3.drop(["Unnamed: 0"], axis=1)
In [39]:
cpg3["filename"] = cpg3["filename"].str[:50]
In [40]:
cpg3["filename"] = cpg3["filename"].str.replace(r'.dan$', '')
cpg3["filename"] = cpg3["filename"].str.replace(r'.da$', '')
In [41]:
normal_cellC_df = pd.merge(normal_cellC_df, cpg3, how='inner')
In [42]:
frames3 = [normal_cellA_df, normal_cellB_df, normal_cellC_df]
normal_result = pd.concat(frames3)
In [43]:
normal_result.shape
Out[43]:
In [44]:
normal_result.columns
Out[44]:
In [45]:
normal_result = normal_result[['filename', 'PDR_total', 'total_reads', 'type', 'bio', 'protocol', 'avgReadCpGs_mean', 'avgReadCpGs_median']]
In [46]:
normal_result.head()
Out[46]:
In [47]:
cll_cellA_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_cw154_A.csv")
# cll_cellB_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_NormalB_CLL_B.csv")
cll_cellC_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_trito_pool_C.csv")
In [ ]:
In [48]:
cll_cellA_df = cll_cellA_df.drop(["Unnamed: 0"], axis=1)
In [49]:
cll_cellA_df["type"] = str('CLL')
In [50]:
cll_cellA_df["protocol"] = cll_cellA_df["filename"].str[5:34]
In [51]:
cll_cellA_df["protocol"][cll_cellA_df["protocol"] == 'cw154_CutSmart_proteinase_K_T'] = 'cw154_CutSmart_proteinase_K'
In [52]:
cll_cellA_df["protocol"][cll_cellA_df["protocol"] == 'cw154_Tris_protease_GR_CAGAGA'] = 'cw154_Tris_protease_GR'
In [53]:
cll_cellA_df["protocol"][(cll_cellA_df["protocol"] != 'cw154_Tris_protease_GR') & (cll_cellA_df["protocol"] != 'cw154_CutSmart_proteinase_K')] = 'cw154_Tris_protease'
In [ ]:
In [54]:
cll_cellA_df["bio"] = str('CLL')
In [55]:
cll_cellA_df["filename"] = cll_cellA_df["filename"].str[:51]
In [56]:
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.da$', '')
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.annoRR$', '')
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.ann$', '')
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.dan$', '')
In [ ]:
In [57]:
cpg4 = pd.read_csv('CLL_RRBS_cw154_A_CpGs.csv')
In [58]:
cpg4 = cpg4.drop(["Unnamed: 0"], axis=1)
In [59]:
cpg4["filename"] = cpg4["filename"].str[:51]
In [60]:
cpg4["filename"] = cpg4["filename"].str.replace(r'.da$', '')
cpg4["filename"] = cpg4["filename"].str.replace(r'.annoRR$', '')
cpg4["filename"] = cpg4["filename"].str.replace(r'.ann$', '')
cpg4["filename"] = cpg4["filename"].str.replace(r'.dan$', '')
In [61]:
cll_cellA_df = pd.merge(cll_cellA_df, cpg4, how='inner')
In [ ]:
In [62]:
cll_cellC_df = cll_cellC_df.drop(["Unnamed: 0"], axis=1)
In [63]:
cll_cellC_df["type"] = str('CLL')
In [64]:
cll_cellC_df["bio"] = str('CLL')
In [65]:
cll_cellC_df["protocol"] = cll_cellC_df["filename"].str[5:17]
In [66]:
cll_cellC_df["filename"] = cll_cellC_df["filename"].str[:33]
In [67]:
cll_cellC_df.head()
Out[67]:
In [ ]:
In [68]:
cpg5 = pd.read_csv('Meth_PDR_cell_RRBS_trito_pool_CpGs.csv')
In [69]:
cpg5 = cpg5.drop(["Unnamed: 0"], axis=1)
In [70]:
cpg5["filename"] = cpg5["filename"].str[:33]
In [71]:
cll_cellC_df = pd.merge(cll_cellC_df, cpg5, how='inner')
In [ ]:
In [72]:
frames2 = [cll_cellA_df, cll_cellC_df]
cll_result = pd.concat(frames2)
In [73]:
cll_result.shape
Out[73]:
In [74]:
cll_result.head()
Out[74]:
In [75]:
cll_result.columns
Out[75]:
In [76]:
cll_result = cll_result[['filename', 'PDR_total', 'total_reads', 'type', 'bio', 'protocol', 'avgReadCpGs_mean', 'avgReadCpGs_median']]
In [77]:
cll_result = cll_result.reset_index(drop=True)
In [78]:
normal_result = normal_result.reset_index(drop=True)
In [79]:
combined2 = pd.concat([normal_result, cll_result])
In [80]:
combined2 = combined2.reset_index(drop=True)
In [81]:
combined2.head()
Out[81]:
In [ ]:
In [82]:
# Variables for a model predicting PDR:
#
# - totCpG: number of unique CpGs per cell
# - avgReadCpGs: median average read CpGs per cell (or mean, if normally distributed)
# - bsRate: BS conversion rate per cell
# - type: CLL or normal status per cell
#
# What are the coefficient and the p-value of each variable in a model predicting PDR?
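In [ ]:
# A minimal sketch (not part of the original pipeline) of how the model described above can be
# fit so that each variable's coefficient and p-value come out of a single summary, using the
# statsmodels formula API. The column names mirror the merged table built below; the tiny toy
# frame here is purely illustrative, not real data.
import pandas as pd
import statsmodels.formula.api as smf

toy = pd.DataFrame({  # hypothetical values, for illustration only
    "PDR": [0.21, 0.25, 0.30, 0.28, 0.26, 0.35, 0.40, 0.38],
    "avgReadCpGs_mean": [3.1, 3.4, 2.9, 3.0, 3.2, 3.3, 3.5, 3.6],
    "Unique_CpGs": [150000, 180000, 120000, 160000, 140000, 170000, 190000, 175000],
    "bsRate": [0.994, 0.991, 0.996, 0.992, 0.995, 0.990, 0.993, 0.989],
    "type": ["normal", "normal", "normal", "normal", "CLL", "CLL", "CLL", "CLL"],
})
fit = smf.ols("PDR ~ avgReadCpGs_mean + Unique_CpGs + bsRate + C(type)", data=toy).fit()
print(fit.params)    # coefficient of each term
print(fit.pvalues)   # p-value of each coefficient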
In [ ]:
In [83]:
bs = pd.read_table('allStats.txt')
bs = bs.drop('sample.1', axis=1)
bs = bs.drop('sample.2', axis=1)
In [84]:
bs = bs.reset_index(drop=True)
In [85]:
# Drop the columns not needed downstream; totCpG is deliberately kept
bs = bs.drop(['class', 'totMeth', 'totSeen', 'avSum', 'avTot', 'rMixed', 'rTot', 'rAv',
              'rAvTot', 'bed', 'methInfoFile', 'totReads', 'totAligned', 'totClipped',
              'totSeenCpG', 'totUsed', 'totMethCpG', 'totalReadPairs', 'alignedReads',
              'totalReads'], axis=1)
In [86]:
bs.head()
Out[86]:
In [87]:
bs = bs.rename(columns = {'sample':'filename'})
In [ ]:
In [88]:
merged = pd.merge(combined2, bs, how='inner')
In [89]:
merged = merged.reset_index(drop=True)
In [90]:
merged = merged.rename(columns = {'PDR_total':'PDR'})
In [91]:
merged = merged.rename(columns = {'totCpG':'Unique_CpGs'})
In [92]:
merged.head()
Out[92]:
In [93]:
# Remove all cells with fewer than 100,000 total reads
merged = merged[merged['total_reads'] > 100000]
In [94]:
scattermatrix1 = merged.drop(['filename', 'total_reads', 'type', 'protocol', 'avgReadCpGs_median'], axis=1)
In [95]:
scattermatrix1.head()
Out[95]:
In [96]:
#sns.lmplot(x="bsRate", y="PDR", row="bio", col="Unique_CpGs", data=scattermatrix1)
In [97]:
sns.lmplot(x="bsRate", y="PDR", data=scattermatrix1)
Out[97]:
In [ ]:
In [ ]:
In [ ]:
In [98]:
sns.pairplot(scattermatrix1, hue='bio')
plt.title('Scatterplot, bio')
Out[98]:
In [99]:
scattermatrix2 = merged.drop(['filename', 'type', 'protocol', 'avgReadCpGs_median'], axis=1)
In [100]:
scattermatrix2.head()
Out[100]:
In [101]:
sns.pairplot(scattermatrix2, hue='bio')
Out[101]:
In [ ]:
In [ ]:
In [228]:
merged.head()
Out[228]:
In [229]:
scattermatrix3 = merged.drop(['filename', 'total_reads', 'bio', 'protocol', 'avgReadCpGs_median'], axis=1)
In [230]:
sns.pairplot(scattermatrix3, hue='type', palette='Set2')
Out[230]:
In [231]:
scattermatrix4 = merged.drop(['filename', 'bio', 'protocol', 'avgReadCpGs_median'], axis=1)
In [232]:
sns.pairplot(scattermatrix4, hue='type', palette='Set1')
Out[232]:
In [233]:
scattermatrix5 = merged.drop(['filename','total_reads', 'bio', 'type', 'avgReadCpGs_median'], axis=1)
In [234]:
sns.pairplot(scattermatrix5, hue='protocol')
Out[234]:
In [235]:
scattermatrix6 = merged.drop(['filename', 'bio', 'type', 'avgReadCpGs_median'], axis=1)
In [236]:
sns.pairplot(scattermatrix6, hue='protocol')
Out[236]:
In [237]:
scattermatrix7 = merged.drop(['filename', 'bio', 'avgReadCpGs_median'], axis=1)
In [238]:
scattermatrix7.columns
Out[238]:
In [239]:
y = scattermatrix7.PDR # dependent variable
In [ ]:
In [240]:
y.shape
Out[240]:
In [241]:
X = scattermatrix7.drop(['PDR', 'total_reads', 'protocol'], axis=1)
In [242]:
X.shape
Out[242]:
In [243]:
X.head()
Out[243]:
In [244]:
categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)
In [245]:
X.head()
Out[245]:
In [246]:
# only use type_CLL
X_CLL = X.drop("type_normal", axis=1)
X_CLL.head()
Out[246]:
In [247]:
from sklearn import linear_model
clf = linear_model.LinearRegression()
clf.fit(X,y)  # note: X still holds both type dummies, which are collinear with the intercept; X_CLL below drops one
Out[247]:
In [248]:
X.columns
Out[248]:
In [249]:
print(clf.coef_)
In [250]:
clf.score(X,y) # Returns the coefficient of determination R^2 of the prediction.
Out[250]:
In [251]:
from sklearn.feature_selection import f_regression
print("p-values are: ")
print(f_regression(X, y)[1])
In [252]:
clf = linear_model.LinearRegression()
clf.fit(X_CLL,y) # it's either CLL 0 or CLL 1
Out[252]:
In [253]:
X_CLL.columns
Out[253]:
In [254]:
print(clf.coef_)
In [255]:
clf.score(X_CLL,y)
Out[255]:
In [256]:
print("p-values are: ")
print(f_regression(X_CLL, y)[1])
In [257]:
# Bayesian ridge regression
clf = linear_model.BayesianRidge()
clf.fit(X,y)
Out[257]:
In [126]:
print(clf.coef_)
In [127]:
clf.score(X,y)
Out[127]:
In [128]:
from sklearn.feature_selection import f_regression
print(f_regression(X, y)[1])  # univariate p-values
In [129]:
# Lasso
clf = linear_model.Lasso()
clf.fit(X,y)
Out[129]:
In [130]:
print(clf.coef_) # why zero coefficients? the default alpha=1.0 penalty is strong for these unscaled features, so Lasso shrinks them toward (or to) zero
In [131]:
clf.score(X,y) # why so low? with the coefficients shrunk away, little of the variance in PDR is explained
Out[131]:
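In [ ]:
# Why the Lasso looks so poor here: the default penalty (alpha=1.0) is strong relative to these
# unscaled predictors, so the coefficients get shrunk toward or to zero. A minimal sketch,
# assuming the X and y built above: standardize the predictors and let cross-validation pick
# alpha (the cv value here is an arbitrary illustration, not part of the original analysis).
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV

X_scaled = StandardScaler().fit_transform(X)       # zero mean, unit variance per column
lasso_cv = LassoCV(cv=5, random_state=36).fit(X_scaled, y)
print(lasso_cv.alpha_)                             # penalty selected by cross-validation
print(lasso_cv.coef_)                              # coefficients on the standardized predictors
print(lasso_cv.score(X_scaled, y))                 # R^2 on the training data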
In [132]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)
Out[132]:
In [133]:
X.columns
Out[133]:
In [134]:
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances = feature_importances.sort_values()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance: avgReadCpGs_mean, Unique_CpGs, bsRate, type")
Out[134]:
In [135]:
model.score(X,y)
Out[135]:
In [136]:
scattermatrix7.columns
Out[136]:
In [137]:
sns.lmplot(x="bsRate", y="PDR", data=scattermatrix7, hue='type')
plt.title("PDR vs. bsRate, by type")
Out[137]:
In [138]:
sns.lmplot(x="bsRate", y="PDR", data=scattermatrix7, hue='protocol')
plt.title("PDR vs. bsRate, by protocol")
Out[138]:
In [ ]:
In [139]:
sns.lmplot(x="avgReadCpGs_mean", y="PDR", data=scattermatrix7, hue='type')
plt.title("PDR vs. bsRate, by type")
Out[139]:
In [140]:
sns.lmplot(x="avgReadCpGs_mean", y="PDR", data=scattermatrix7, hue='protocol')
plt.title("PDR vs. bsRate, by protocol")
Out[140]:
In [ ]:
In [ ]:
In [ ]:
In [141]:
sns.jointplot(x="bsRate", y="PDR", data=scattermatrix7, kind="reg")
plt.title("PDR vs. bsRate, jointplot")
Out[141]:
In [142]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR", data=scattermatrix7, kind="reg")
Out[142]:
In [143]:
sns.jointplot(x="Unique_CpGs", y="PDR", data=scattermatrix7, kind="reg")
plt.title("Unique CpGs vs PDR")
Out[143]:
In [ ]:
In [144]:
type_cll = scattermatrix7[scattermatrix7.type == 'CLL']
type_normal = scattermatrix7[scattermatrix7.type == 'normal']
In [145]:
sns.jointplot(x="bsRate", y="PDR", data=type_cll, kind="reg", color='r')
plt.title("PDR vs. bsRate, CLL samples, jointplot")
Out[145]:
In [146]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR", data=type_cll, kind="reg", color='r')
Out[146]:
In [147]:
sns.jointplot(x="Unique_CpGs", y="PDR", data=type_cll, kind="reg", color='r')
plt.title("Unique CpGs vs PDR")
Out[147]:
In [ ]:
In [148]:
sns.jointplot(x="bsRate", y="PDR", data=type_normal, kind="reg", color='g')
plt.title("PDR vs. bsRate, normal samples, jointplot")
Out[148]:
In [149]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR", data=type_normal, kind="reg", color='g')
Out[149]:
In [150]:
sns.jointplot(x="Unique_CpGs", y="PDR", data=type_normal, kind="reg", color='g')
plt.title("Unique CpGs vs PDR")
Out[150]:
In [ ]:
In [151]:
# Can we fit a model with just two variables: BS conversion rate and CLL vs. normal?
In [152]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=scattermatrix7["type"], y=scattermatrix7["bsRate"], hue=merged["bio"], jitter=True)
#sns.plt.title("Biological sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
#plt.xlabel("Normal cells = 304 *.anno files; CLL cells = 112 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
Out[152]:
In [153]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=scattermatrix7["type"], y=scattermatrix7["bsRate"], hue=scattermatrix7["type"], jitter=True)
#sns.plt.title("Biological sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
#plt.xlabel("Normal cells = 304 *.anno files; CLL cells = 112 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
Out[153]:
In [154]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=scattermatrix7["type"], y=scattermatrix7["bsRate"], hue=scattermatrix7["protocol"], jitter=True)
#sns.plt.title("Biological sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
#plt.xlabel("Normal cells = 304 *.anno files; CLL cells = 112 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
Out[154]:
In [155]:
y = scattermatrix7.PDR # dependent variable
X = scattermatrix7.drop(['PDR', 'total_reads', 'protocol', 'avgReadCpGs_mean', 'Unique_CpGs'], axis=1)
In [156]:
X.shape
Out[156]:
In [157]:
X.head()
Out[157]:
In [158]:
categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)
In [159]:
from sklearn import linear_model
clf = linear_model.LinearRegression()
clf.fit(X,y)
Out[159]:
In [160]:
X.columns
Out[160]:
In [161]:
print(clf.coef_)
In [162]:
clf.score(X,y)
Out[162]:
In [163]:
# Lasso
clf = linear_model.Lasso()
clf.fit(X,y)
Out[163]:
In [164]:
print(clf.coef_)
In [165]:
clf.score(X,y)
Out[165]:
In [166]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)
Out[166]:
In [167]:
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances = feature_importances.sort_values()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance: two variables: bs conversion and cll_vs-normal")
Out[167]:
In [168]:
model.score(X,y)
Out[168]:
In [169]:
from sklearn.feature_selection import f_regression
print(f_regression(X, y)[1])
# f_regression returns:
#   F    : array, shape (n_features,) -- the F-statistic of each feature
#   pval : array, shape (n_features,) -- the p-value of each F-statistic
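In [ ]:
# Small sketch: label the univariate F-statistics and p-values by predictor for readability.
# Keep in mind these are per-feature univariate tests against y, not the joint-OLS coefficient
# p-values (those come from the statsmodels summaries below). Assumes the X and y defined above.
from sklearn.feature_selection import f_regression

F_vals, p_vals = f_regression(X, y)
print(pd.DataFrame({"F": F_vals, "p_value": p_vals}, index=X.columns))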
In [258]:
# multiple regression model describes the response as a weighted sum of the predictors:
# 'avgReadCpGs_mean', 'Unique_CpGs', 'bsRate', 'type_CLL'
#
# PDR = \beta_0 + \beta_1 \cdot \text{avgReadCpGs\_mean} + \beta_2 \cdot \text{Unique\_CpGs} + \beta_3 \cdot \text{bsRate} + \beta_4 \cdot \text{type\_CLL}
#
In [259]:
import statsmodels.api as sm
In [263]:
y = scattermatrix7.PDR # dependent variable
X = scattermatrix7.drop(['PDR', 'total_reads', 'protocol'], axis=1)
In [264]:
categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)
In [265]:
X.columns
Out[265]:
In [266]:
X = X.drop('type_CLL', axis=1)
In [267]:
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
In [268]:
est.summary()
Out[268]:
In [280]:
y = scattermatrix7.PDR # dependent variable
X = scattermatrix7.drop(['PDR', 'total_reads', 'protocol', 'avgReadCpGs_mean', 'Unique_CpGs'], axis=1)
In [281]:
categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)
In [282]:
X.columns
Out[282]:
In [283]:
X = X.drop('type_normal', axis=1)
In [428]:
X.columns
Out[428]:
In [429]:
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
In [430]:
est.summary()
Out[430]:
In [ ]:
In [523]:
tritopool = merged[merged["protocol"] == 'trito_pool_1']
In [ ]:
In [524]:
tritopool = tritopool.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
In [525]:
tritopool = tritopool.reset_index(drop=True)
In [526]:
tritopool.head()
Out[526]:
In [ ]:
In [527]:
tritopoolA = tritopool.set_index("filename")
In [528]:
from itertools import combinations
cc = list(combinations(tritopool.filename,2))
In [529]:
len(cc)
Out[529]:
In [530]:
out = pd.DataFrame([tritopoolA.loc[list(c), :].mean() for c in cc], index=cc)
In [531]:
out.head()
Out[531]:
In [532]:
tritopoolA
Out[532]:
In [533]:
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.PDR, tritopool.PDR)), tritopool.filename, tritopool.filename)
In [534]:
df_ex.head()
Out[534]:
In [535]:
row = df_ex.iloc[0]
In [536]:
row.values
Out[536]:
In [537]:
pdr = tritopool[['filename', 'PDR']]
In [538]:
pdr = pdr.set_index("filename")
In [539]:
pdr
Out[539]:
In [540]:
stacked = df_ex.stack()
In [541]:
#stacked.index
In [542]:
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
In [543]:
pdr_differences.head()
Out[543]:
In [544]:
out['filename'] = out.index
In [545]:
out = out.reset_index(drop=True)
In [546]:
out.head()
Out[546]:
In [547]:
pairs = pd.merge(out, pdr_differences, how='inner')
In [548]:
pairs.head()
Out[548]:
In [549]:
pairs.shape
Out[549]:
In [550]:
"""
2. We need a regression model for predicting PDR distance between any two cells (highest priority):
Calculate the absolute difference in PDR between any two cells. Choose only pairs of cells that are in the same batch to control for batch effects.
For each pair, generate an average of the two cells for: BS rate, number of unique CpG, Median Average Read CpG per cell.
Variables of the regression model are:
- Number of unique CpGs per pair
- Median Average Read CpG per pair (or mean if normally distributed)
- BS rate per pair
- CLL or Normal per pair
What is the coefficient and the P value for each variable for a model predicting PDR distance between two cells?
"""
Out[550]:
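In [ ]:
# Hedged sketch of the pairwise construction described in the docstring above: for every
# within-batch pair of cells, average the covariates and attach the absolute PDR difference.
# pair_table() is a hypothetical helper, not part of this notebook's pipeline; it assumes a
# per-cell frame (like tritopool) with a 'filename' column, a 'PDR' column and numeric covariates.
from itertools import combinations

def pair_table(cells):
    indexed = cells.set_index("filename")
    rows = []
    for a, b in combinations(cells["filename"], 2):
        averaged = indexed.loc[[a, b]].mean().to_dict()   # mean covariates (and PDR) of the pair
        averaged["PDR_difference"] = abs(indexed.loc[a, "PDR"] - indexed.loc[b, "PDR"])
        averaged["filename"] = (a, b)
        rows.append(averaged)
    return pd.DataFrame(rows)

# e.g. pairs_sketch = pair_table(tritopool)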
In [551]:
pairs = pairs.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
In [552]:
pairs.head()
Out[552]:
In [553]:
pairs.columns
Out[553]:
In [554]:
y = pairs.PDR_difference # dependent variable
X = pairs.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
In [555]:
y.shape
Out[555]:
In [556]:
X.shape
Out[556]:
In [557]:
X.head()
Out[557]:
In [558]:
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
In [559]:
est.summary()
Out[559]:
In [560]:
pairs.columns
Out[560]:
In [585]:
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")
Out[585]:
In [587]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference", data=pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")
Out[587]:
In [588]:
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference", data=pairs, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")
Out[588]:
In [564]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances = feature_importances.sort_values()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
Out[564]:
In [565]:
model.score(X,y) # RF model score
Out[565]:
In [567]:
# cw154_Tris_protease
# NormalBCD19pCD27pcell23_44
# NormalBCD19pCD27mcell23_44
# normal_B_cell_C1_24
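In [ ]:
# Hedged sketch: the cells below repeat the same pair construction and OLS fit batch by batch;
# the same work could be driven by a loop over the protocols actually used in this notebook.
# Relies on the hypothetical pair_table() helper sketched earlier and on the merged table above.
batch_types = {
    "trito_pool_1": "CLL",
    "cw154_Tris_protease": "CLL",
    "NormalBCD19pCD27pcell23_44": "normal",
    "NormalBCD19pCD27mcell23_44": "normal",
}
per_batch = []
for protocol, cell_type in batch_types.items():
    cells = merged[merged["protocol"] == protocol].drop(
        ["type", "bio", "protocol", "avgReadCpGs_median"], axis=1).reset_index(drop=True)
    batch = pair_table(cells)
    batch["type"] = cell_type
    per_batch.append(batch)
all_pairs_sketch = pd.concat(per_batch).reset_index(drop=True)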
In [568]:
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease']
In [569]:
cw154 = cw154.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
In [570]:
cw154 = cw154.reset_index(drop=True)
In [571]:
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[list(c), :].mean() for c in cc], index=cc)
In [572]:
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.PDR, cw154.PDR)), cw154.filename, cw154.filename)
In [573]:
stacked = df_ex.stack()
In [574]:
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
In [575]:
pdr_differences.head()
Out[575]:
In [576]:
out['filename'] = out.index
out = out.reset_index(drop=True)
In [577]:
pairs2 = pd.merge(out, pdr_differences, how='inner')
In [578]:
pairs2 = pairs2.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
In [579]:
pairs2.head()
Out[579]:
In [582]:
y = pairs2.PDR_difference # dependent variable
X = pairs2.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
In [583]:
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
est.summary()
Out[583]:
In [589]:
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs2, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, cw154_Tris_protease")
Out[589]:
In [590]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference", data=pairs2, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154_Tris_protease")
Out[590]:
In [591]:
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference", data=pairs2, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, cw154_Tris_protease")
Out[591]:
In [595]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances = feature_importances.sort_values()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(model.score(X,y))
In [596]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell23_44']
pcell = pcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
pcell = pcell.reset_index(drop=True)
In [597]:
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[list(c), :].mean() for c in cc], index=cc)
In [598]:
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR, pcell.PDR)), pcell.filename, pcell.filename)
In [599]:
stacked = df_ex.stack()
In [600]:
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
In [601]:
out['filename'] = out.index
out = out.reset_index(drop=True)
In [602]:
pairs3 = pd.merge(out, pdr_differences, how='inner')
pairs3 = pairs3.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
In [603]:
y = pairs3.PDR_difference # dependent variable
X = pairs3.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
est.summary()
Out[603]:
In [604]:
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs3, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, CD27 pcell ")
Out[604]:
In [605]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference", data=pairs3, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, CD27 pcell")
Out[605]:
In [606]:
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference", data=pairs3, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, CD27 pcell")
Out[606]:
In [607]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances = feature_importances.sort_values()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(model.score(X,y))
In [623]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell23_44']
mcell = mcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
mcell = mcell.reset_index(drop=True)
In [624]:
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[list(c), :].mean() for c in cc], index=cc)
In [625]:
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR, mcell.PDR)), mcell.filename, mcell.filename)
In [626]:
stacked = df_ex.stack()
In [627]:
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
In [628]:
out['filename'] = out.index
out = out.reset_index(drop=True)
In [629]:
pairs4 = pd.merge(out, pdr_differences, how='inner')
pairs4 = pairs4.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
In [632]:
y = pairs4.PDR_difference # dependent variable
X = pairs4.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
est.summary()
Out[632]:
In [633]:
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs4, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, CD27 mcell ")
Out[633]:
In [634]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference", data=pairs4, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, CD27 mcell")
Out[634]:
In [635]:
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference", data=pairs4, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, CD27 mcell")
Out[635]:
In [636]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances = feature_importances.sort_values()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(model.score(X,y))
In [639]:
pairs['type'] = str('CLL')
In [643]:
pairs2['type'] = str('CLL')
In [647]:
pairs3['type'] = str('normal')
In [648]:
pairs4['type'] = str('normal')
In [649]:
frames22 = [pairs, pairs2, pairs3, pairs4]
total_pairs = pd.concat(frames22)
In [650]:
total_pairs.head()
Out[650]:
In [651]:
total_pairs.columns
Out[651]:
In [ ]:
In [658]:
y = total_pairs.PDR_difference # dependent variable
X = total_pairs.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
In [659]:
categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)
In [660]:
X.columns
Out[660]:
In [661]:
X = X.drop(['type_normal'], axis=1)
In [662]:
X.columns
Out[662]:
In [663]:
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
est.summary()
Out[663]:
In [664]:
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=total_pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, both CLL and normal ")
Out[664]:
In [670]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference", data=total_pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, both CLL and normal ")
Out[670]:
In [671]:
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference", data=total_pairs, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, both CLL and normal ")
Out[671]:
In [672]:
total_pairs.columns
Out[672]:
In [673]:
sns.lmplot(x="bsRate_mean", y="PDR_difference", data=total_pairs, hue='type')
Out[673]:
In [674]:
sns.lmplot(x="avgReadCpGs_mean", y="PDR_difference", data=total_pairs, hue='type')
Out[674]:
In [675]:
sns.lmplot(x="Unique_CpGs_mean", y="PDR_difference", data=total_pairs, hue='type')
Out[675]:
In [676]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances = feature_importances.sort_values()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(model.score(X,y))
In [ ]:
In [ ]:
In [ ]:
In [ ]: