notebook.community

Edit and run



In [1]:

    
%matplotlib inline



In [2]:

    
#
# We need a regression model for predicting methylation (highest priority): 
# Variables are: 
#   - Number of unique CpGs per cell
#   - Median Average Read CpG per cell (or mean if normally distributed)
#   - BS rate per cell
#   - CLL or Normal status per cell
# What is the coefficient and the P value for each variable for a model predicting methylation?



In [3]:

    
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import statsmodels.api as sm
pd.set_option('display.max_columns', 50) # print all rows


import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')



In [4]:

    
normal_cellA_df = pd.read_csv("unweighted_PDR_methyl_RRBS_normal_B.csv")
normal_cellB_df = pd.read_csv("unweighted_PDR_methyl_CD27pcell.csv") 
normal_cellC_df = pd.read_csv("unweighted_PDR_methyl_CD27mcell.csv")
cll_cellA_df = pd.read_csv("unweighted_PDR_methyl_RRBS_cw154.csv")
cll_cellC_df = pd.read_csv("unweighted_PDR_methyl_RRBS_trito_pool.csv")



In [5]:

    
print(normal_cellA_df.shape)
print(normal_cellB_df.shape) 
print(normal_cellC_df.shape)
print(cll_cellA_df.shape)
print(cll_cellC_df.shape)

print(normal_cellA_df.head())
print(normal_cellB_df.head())
normal_cellC_df.head()
cll_cellA_df.head()
cll_cellC_df.head()









    



(126, 5)
(90, 5)
(88, 5)
(66, 5)
(44, 5)
   Unnamed: 0                                           filename  \
0           0  RRBS_normal_B_cell_A1_24_TAAGGCGA.ACAACC.dan.a...   
1           1  RRBS_normal_B_cell_A1_24_TAAGGCGA.ACCGCG.dan.a...   
2           2  RRBS_normal_B_cell_A1_24_TAAGGCGA.ACGTGG.dan.a...   
3           3  RRBS_normal_B_cell_A1_24_TAAGGCGA.ACTCAC.dan.a...   
4           4  RRBS_normal_B_cell_A1_24_TAAGGCGA.AGGATG.dan.a...   

   PDR_unweighted  methylation_unweighted  total_reads  
0        0.254835                0.691996   11894660.0  
1        0.390562                0.620106    3744659.0  
2        0.266418                0.699736   10461874.0  
3        0.265385                0.763173      14051.0  
4        0.240201                0.732036   21928743.0  
   Unnamed: 0                                           filename  \
0           0  RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACAACC...   
1           1  RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACCGCG...   
2           2  RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACGTGG...   
3           3  RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACTCAC...   
4           4  RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.AGGATG...   

   PDR_unweighted  methylation_unweighted  total_reads  
0        0.336813                0.607640    5781769.0  
1        0.317133                0.525005     901283.0  
2        0.283056                0.584993       2640.0  
3        0.245889                0.616733    6036855.0  
4        0.330910                0.517017    3210282.0  






    Out[5]:






  
    
      
      Unnamed: 0
      filename
      PDR_unweighted
      methylation_unweighted
      total_reads
    
  
  
    
      0
      0
      RRBS_trito_pool_1_TAAGGCGA.ACAACC.dan.annoRRBS...
      0.357578
      0.648881
      21089805.0
    
    
      1
      1
      RRBS_trito_pool_1_TAAGGCGA.ACGTGG.dan.annoRRBS...
      0.358350
      0.642320
      12424887.0
    
    
      2
      2
      RRBS_trito_pool_1_TAAGGCGA.ACTCAC.dan.annoRRBS...
      0.353506
      0.649909
      18443935.0
    
    
      3
      3
      RRBS_trito_pool_1_TAAGGCGA.ATAGCG.dan.annoRRBS...
      0.357433
      0.635992
      12731278.0
    
    
      4
      4
      RRBS_trito_pool_1_TAAGGCGA.ATCGAC.dan.annoRRBS...
      0.359219
      0.647797
      20736297.0



In [6]:

    
normal_cellA_df = normal_cellA_df.drop(["Unnamed: 0"], axis=1)  
normal_cellA_df["type"] = str('normal')
normal_cellA_df["bio"] = str('normal_B')
normal_cellA_df["protocol"] = normal_cellA_df["filename"].str[5:24]
normal_cellA_df["filename"] = normal_cellA_df["filename"].str[:40]



In [7]:

    
normal_cellA_df.head()









    Out[7]:






  
    
      
      filename
      PDR_unweighted
      methylation_unweighted
      total_reads
      type
      bio
      protocol
    
  
  
    
      0
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACAACC
      0.254835
      0.691996
      11894660.0
      normal
      normal_B
      normal_B_cell_A1_24
    
    
      1
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACCGCG
      0.390562
      0.620106
      3744659.0
      normal
      normal_B
      normal_B_cell_A1_24
    
    
      2
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACGTGG
      0.266418
      0.699736
      10461874.0
      normal
      normal_B
      normal_B_cell_A1_24
    
    
      3
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACTCAC
      0.265385
      0.763173
      14051.0
      normal
      normal_B
      normal_B_cell_A1_24
    
    
      4
      RRBS_normal_B_cell_A1_24_TAAGGCGA.AGGATG
      0.240201
      0.732036
      21928743.0
      normal
      normal_B
      normal_B_cell_A1_24



In [8]:

    
cpg1 = pd.read_csv('Meth_PDR_cell_RRBS_normal_B1_CpGs.csv')
cpg1 = cpg1.drop(["Unnamed: 0"], axis=1)  
cpg1["filename"] = cpg1["filename"].str[:40]
normal_cellA_df = pd.merge(normal_cellA_df, cpg1, how='inner')
normal_cellB_df["type"] = str('normal')
normal_cellB_df = normal_cellB_df.drop(["Unnamed: 0"], axis=1) 
normal_cellB_df["bio"] = str('CD27p')
normal_cellB_df["protocol"] = normal_cellB_df["filename"].str[5:31]
normal_cellB_df["filename"] = normal_cellB_df["filename"].str[:50]
normal_cellB_df["filename"] = normal_cellB_df["filename"].str.replace(r'.dan$', '')
normal_cellB_df["filename"] = normal_cellB_df["filename"].str.replace(r'.da$', '')
cpg2 = pd.read_csv('NormalBCD19pCD27pcell_CpGs.csv')
cpg2 = cpg2.drop(["Unnamed: 0"], axis=1)  
cpg2["filename"] = cpg2["filename"].str[:50]
cpg2["filename"] = cpg2["filename"].str.replace(r'.dan$', '')
cpg2["filename"] = cpg2["filename"].str.replace(r'.da$', '')
normal_cellB_df = pd.merge(normal_cellB_df, cpg2, how='inner')
normal_cellC_df = normal_cellC_df.drop(["Unnamed: 0"], axis=1)  
normal_cellC_df["type"] = str('normal')
normal_cellC_df["protocol"] = normal_cellC_df["filename"].str[5:31]
normal_cellC_df["bio"] = str('CD27m')
normal_cellC_df["filename"] = normal_cellC_df["filename"].str[:50]
normal_cellC_df["filename"] = normal_cellC_df["filename"].str.replace(r'.dan$', '')
normal_cellC_df["filename"] = normal_cellC_df["filename"].str.replace(r'.da$', '')
cpg3 = pd.read_csv("NormalBCD19pCD27mcell_CpGs.csv")
cpg3 = cpg3.drop(["Unnamed: 0"], axis=1)  
cpg3["filename"] = cpg3["filename"].str[:50]
cpg3["filename"] = cpg3["filename"].str.replace(r'.dan$', '')
cpg3["filename"] = cpg3["filename"].str.replace(r'.da$', '')
normal_cellC_df = pd.merge(normal_cellC_df, cpg3, how='inner')
frames3 = [normal_cellA_df, normal_cellB_df, normal_cellC_df]
normal_result = pd.concat(frames3)
print(normal_result.shape)
print(normal_result.columns)









    



(304, 9)
Index(['PDR_unweighted', 'avgReadCpGs_mean', 'avgReadCpGs_median', 'bio',
       'filename', 'methylation_unweighted', 'protocol', 'total_reads',
       'type'],
      dtype='object')



In [9]:

    
normal_result = normal_result[['filename', 'methylation_unweighted', 'total_reads', 'type', 'bio', 'protocol', 'avgReadCpGs_mean', 'avgReadCpGs_median']]



In [10]:

    
cll_cellA_df = cll_cellA_df.drop(["Unnamed: 0"], axis=1) 
cll_cellA_df["type"] = str('CLL')
cll_cellA_df["protocol"] = cll_cellA_df["filename"].str[5:34]
cll_cellA_df["protocol"][cll_cellA_df["protocol"] == 'cw154_CutSmart_proteinase_K_T'] = 'cw154_CutSmart_proteinase_K'
cll_cellA_df["protocol"][cll_cellA_df["protocol"] == 'cw154_Tris_protease_GR_CAGAGA'] = 'cw154_Tris_protease_GR'
cll_cellA_df["protocol"][(cll_cellA_df["protocol"] != 
        'cw154_Tris_protease_GR') & (cll_cellA_df["protocol"] != 'cw154_CutSmart_proteinase_K')] = 'cw154_Tris_protease'
cll_cellA_df["bio"] = str('CLL')
cll_cellA_df["filename"] = cll_cellA_df["filename"].str[:51]
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.da$', '')
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.annoRR$', '')
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.ann$', '')
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.dan$', '')
cpg4 = pd.read_csv('CLL_RRBS_cw154_A_CpGs.csv')
cpg4 = cpg4.drop(["Unnamed: 0"], axis=1)  
cpg4["filename"] = cpg4["filename"].str[:51]
cpg4["filename"] = cpg4["filename"].str.replace(r'.da$', '')
cpg4["filename"] = cpg4["filename"].str.replace(r'.annoRR$', '')
cpg4["filename"] = cpg4["filename"].str.replace(r'.ann$', '')
cpg4["filename"] = cpg4["filename"].str.replace(r'.dan$', '')
cll_cellA_df = pd.merge(cll_cellA_df, cpg4, how='inner')
cll_cellC_df = cll_cellC_df.drop(["Unnamed: 0"], axis=1) 
cll_cellC_df["type"] = str('CLL')
cll_cellC_df["bio"] = str('CLL')
cll_cellC_df["protocol"] = cll_cellC_df["filename"].str[5:17]
cll_cellC_df["filename"] = cll_cellC_df["filename"].str[:33]
cpg5 = pd.read_csv('Meth_PDR_cell_RRBS_trito_pool_CpGs.csv')
cpg5 = cpg5.drop(["Unnamed: 0"], axis=1) 
cpg5["filename"] = cpg5["filename"].str[:33]
cll_cellC_df = pd.merge(cll_cellC_df, cpg5, how='inner')
frames2 = [cll_cellA_df, cll_cellC_df]
cll_result = pd.concat(frames2)
print(cll_result.shape)
print(cll_result.columns)









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy






    



(110, 9)
Index(['PDR_unweighted', 'avgReadCpGs_mean', 'avgReadCpGs_median', 'bio',
       'filename', 'methylation_unweighted', 'protocol', 'total_reads',
       'type'],
      dtype='object')



In [11]:

    
cll_result = cll_result[['filename', 'methylation_unweighted', 'total_reads', 'type', 'bio', 'protocol', 'avgReadCpGs_mean', 'avgReadCpGs_median']]
cll_result = cll_result.reset_index(drop=True)
normal_result = normal_result.reset_index(drop=True)
combined2 = normal_result.append(cll_result)
combined2 = combined2.reset_index(drop=True)
combined2.head()









    Out[11]:






  
    
      
      filename
      methylation_unweighted
      total_reads
      type
      bio
      protocol
      avgReadCpGs_mean
      avgReadCpGs_median
    
  
  
    
      0
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACAACC
      0.691996
      11894660.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.295301
      4.0
    
    
      1
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACCGCG
      0.620106
      3744659.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.285714
      4.0
    
    
      2
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACGTGG
      0.699736
      10461874.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.453122
      5.0
    
    
      3
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACTCAC
      0.763173
      14051.0
      normal
      normal_B
      normal_B_cell_A1_24
      4.950166
      4.0
    
    
      4
      RRBS_normal_B_cell_A1_24_TAAGGCGA.AGGATG
      0.732036
      21928743.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.366276
      5.0



In [12]:

    
bs = pd.read_table('allStats.txt')
bs = bs.drop('sample.1', axis=1)
bs = bs.drop('sample.2', axis=1)
bs = bs.reset_index(drop=True)
bs = bs.drop('class', axis=1)
bs = bs.drop('totMeth', axis=1)
bs = bs.drop('totSeen', axis=1)
bs = bs.drop('avSum', axis=1)
bs = bs.drop('avTot', axis=1)
bs = bs.drop('rMixed', axis=1)
bs = bs.drop('rTot', axis=1)
bs = bs.drop('rAv', axis=1)
bs = bs.drop('rAvTot', axis=1)
bs = bs.drop('bed', axis=1)
bs = bs.drop('methInfoFile', axis=1)
bs = bs.drop('totReads', axis=1)
bs = bs.drop('totAligned', axis=1)
bs = bs.drop('totClipped', axis=1)
bs = bs.drop('totSeenCpG', axis=1)
bs = bs.drop('totUsed', axis=1)
bs = bs.drop('totMethCpG', axis=1)
# bs = bs.drop('totCpG', axis=1)
bs = bs.drop('totalReadPairs', axis=1)
bs = bs.drop('alignedReads', axis=1)
bs = bs.drop('totalReads', axis=1)
bs = bs.rename(columns = {'sample':'filename'})



In [13]:

    
merged = pd.merge(combined2, bs, how='inner')
merged = merged.reset_index(drop=True)



In [14]:

    
merged.shape









    Out[14]:





(414, 10)



In [15]:

    
merged = pd.merge(combined2, bs, how='inner')
merged = merged.reset_index(drop=True)
merged = merged.rename(columns = {'totCpG':'Unique_CpGs'})
# Remove all data points with less than 100k in totcpg 
merged = merged[merged['total_reads'] > 100000]
scattermatrix1 = merged.drop(['filename', 'type', 'protocol', 'avgReadCpGs_median'], axis=1)
sns.lmplot(x="bsRate", y="methylation_unweighted",  data=scattermatrix1)
plt.title("methylation vs bisulfite conversion rate")









    Out[15]:





<matplotlib.text.Text at 0x102c8a9e8>



In [16]:

    
combined2.head()









    Out[16]:






  
    
      
      filename
      methylation_unweighted
      total_reads
      type
      bio
      protocol
      avgReadCpGs_mean
      avgReadCpGs_median
    
  
  
    
      0
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACAACC
      0.691996
      11894660.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.295301
      4.0
    
    
      1
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACCGCG
      0.620106
      3744659.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.285714
      4.0
    
    
      2
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACGTGG
      0.699736
      10461874.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.453122
      5.0
    
    
      3
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACTCAC
      0.763173
      14051.0
      normal
      normal_B
      normal_B_cell_A1_24
      4.950166
      4.0
    
    
      4
      RRBS_normal_B_cell_A1_24_TAAGGCGA.AGGATG
      0.732036
      21928743.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.366276
      5.0



In [17]:

    
merged.head()









    Out[17]:






  
    
      
      filename
      methylation_unweighted
      total_reads
      type
      bio
      protocol
      avgReadCpGs_mean
      avgReadCpGs_median
      Unique_CpGs
      bsRate
    
  
  
    
      0
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACAACC
      0.691996
      11894660.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.295301
      4.0
      178825
      0.959657
    
    
      1
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACCGCG
      0.620106
      3744659.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.285714
      4.0
      86434
      0.958634
    
    
      2
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACGTGG
      0.699736
      10461874.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.453122
      5.0
      186115
      0.958881
    
    
      4
      RRBS_normal_B_cell_A1_24_TAAGGCGA.AGGATG
      0.732036
      21928743.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.366276
      5.0
      289150
      0.958404
    
    
      5
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ATAGCG
      0.648127
      10864882.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.269581
      4.0
      169511
      0.959474



In [18]:

    
bs.head()









    Out[18]:






  
    
      
      filename
      totCpG
      bsRate
    
  
  
    
      0
      RRBS_trito_pool_1_TAAGGCGA.ACAACC
      995714
      0.980280
    
    
      1
      RRBS_trito_pool_1_TAAGGCGA.ACGTGG
      705787
      0.980081
    
    
      2
      RRBS_trito_pool_1_TAAGGCGA.ACTCAC
      865744
      0.980305
    
    
      3
      RRBS_trito_pool_1_TAAGGCGA.AGGATG
      955160
      0.980392
    
    
      4
      RRBS_trito_pool_1_TAAGGCGA.ATAGCG
      634455
      0.980256



In [19]:

    
scattermatrix7 = merged.drop(['filename', 'bio', 'avgReadCpGs_median'], axis=1)
y = scattermatrix7.methylation_unweighted # dependent variable
print(y.shape)
X = scattermatrix7.drop(['methylation_unweighted', 'total_reads', 'protocol'], axis=1)
print(X.shape)

categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)
    
import statsmodels.api as sm
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict methylation")
print("Variables: Number of unique CpGs per cell, mean Average Read CpG per cell, BS rate, Cll or Normal B")
est.summary()









    



(358,)
(358, 4)
Regression results for CLL 'RRBS_trito_pool1', predict methylation
Variables: Number of unique CpGs per cell, mean Average Read CpG per cell, BS rate, Cll or Normal B






    Out[19]:





OLS Regression Results

  Dep. Variable:     methylation_unweighted    R-squared:             0.457


  Model:                       OLS             Adj. R-squared:        0.451


  Method:                 Least Squares        F-statistic:           74.21


  Date:                 Fri, 08 Jul 2016       Prob (F-statistic):  1.36e-45


  Time:                     16:04:53           Log-Likelihood:       746.26


  No. Observations:             358            AIC:                  -1483.


  Df Residuals:                 353            BIC:                  -1463.


  Df Model:                       4                                        


  Covariance Type:          nonrobust                                      




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 1.4335      0.081     17.720   0.000      1.274     1.593


  avgReadCpGs_mean     -0.0471      0.012     -3.851   0.000     -0.071    -0.023


  Unique_CpGs        5.801e-08    6.7e-09      8.653   0.000   4.48e-08  7.12e-08


  bsRate               -1.3260      0.102    -12.938   0.000     -1.528    -1.124


  type_CLL              0.7097      0.040     17.715   0.000      0.631     0.789


  type_normal           0.7238      0.041     17.685   0.000      0.643     0.804




  Omnibus:         4.079    Durbin-Watson:         1.171


  Prob(Omnibus):   0.130    Jarque-Bera (JB):      3.851


  Skew:           -0.247    Prob(JB):              0.146


  Kurtosis:        3.116    Cond. No.           6.18e+20



In [20]:

    
tritopool = merged[merged["protocol"] == 'trito_pool_1']
tritopool = tritopool.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
tritopoolA = tritopool.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool.filename,2))
out = pd.DataFrame([tritopoolA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.methylation_unweighted, tritopool.methylation_unweighted)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs = pd.merge(out, methylation_differences, how='inner')
print(pairs.shape)
pairs = pairs.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs.methylation_difference # dependent variable
X = pairs.drop(['methylation_unweighted', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict \delta methylation")
est.summary()









    



(210, 7)
(210,)
(210, 3)
Regression results for CLL 'RRBS_trito_pool1', predict \delta methylation






    Out[20]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.003


  Model:                       OLS             Adj. R-squared:       -0.011


  Method:                 Least Squares        F-statistic:          0.2122


  Date:                 Fri, 08 Jul 2016       Prob (F-statistic):    0.888 


  Time:                     16:04:53           Log-Likelihood:       800.81


  No. Observations:             210            AIC:                  -1594.


  Df Residuals:                 206            BIC:                  -1580.


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                -2.1710      5.173     -0.420   0.675    -12.370     8.028


  avgReadCpGs_mean      0.0049      0.007      0.700   0.484     -0.009     0.019


  Unique_CpGs_mean  -2.656e-09   4.88e-09     -0.545   0.587  -1.23e-08  6.96e-09


  bsRate_mean           2.1980      5.272      0.417   0.677     -8.196    12.592




  Omnibus:        12.077    Durbin-Watson:         2.275


  Prob(Omnibus):   0.002    Jarque-Bera (JB):     11.720


  Skew:            0.527    Prob(JB):            0.00285


  Kurtosis:        2.521    Cond. No.           1.72e+10



In [21]:

    
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[21]:





<matplotlib.text.Text at 0x1075d7588>



In [22]:

    
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference",  data=pairs, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[22]:





<matplotlib.text.Text at 0x10775f7f0>



In [23]:

    
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference",  data=pairs, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[23]:





<matplotlib.text.Text at 0x1082c99b0>



In [24]:

    
tritopool2 = merged[merged["protocol"] == 'trito_pool_2']
tritopool2 = tritopool2.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
tritopool2A = tritopool2.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool2.filename,2))
out = pd.DataFrame([tritopool2A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool2.methylation_unweighted, tritopool2.methylation_unweighted)), tritopool2.filename, tritopool2.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1a = pd.merge(out, methylation_differences, how='inner')
print(pairs1a.shape)
pairs1a = pairs1a.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs1a.methylation_difference # dependent variable
X = pairs1a.drop(['methylation_unweighted', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool_2', predict \delta methylation")
est.summary()









    



(253, 7)
(253,)
(253, 3)
Regression results for CLL 'RRBS_trito_pool_2', predict \delta methylation






    Out[24]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.036


  Model:                       OLS             Adj. R-squared:        0.025


  Method:                 Least Squares        F-statistic:           3.124


  Date:                 Fri, 08 Jul 2016       Prob (F-statistic):   0.0265 


  Time:                     16:04:56           Log-Likelihood:       934.54


  No. Observations:             253            AIC:                  -1861.


  Df Residuals:                 249            BIC:                  -1847.


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 3.8501      3.720      1.035   0.302     -3.476    11.177


  avgReadCpGs_mean      0.0215      0.007      2.882   0.004      0.007     0.036


  Unique_CpGs_mean  -4.776e-09   3.09e-09     -1.545   0.124  -1.09e-08  1.31e-09


  bsRate_mean          -4.0532      3.830     -1.058   0.291    -11.597     3.490




  Omnibus:        27.220    Durbin-Watson:         1.512


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     32.640


  Skew:            0.854    Prob(JB):           8.17e-08


  Kurtosis:        3.425    Cond. No.           1.24e+10



In [25]:

    
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs1a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[25]:





<matplotlib.text.Text at 0x10845e5c0>



In [26]:

    
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference",  data=pairs1a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[26]:





<matplotlib.text.Text at 0x108680588>



In [27]:

    
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference",  data=pairs1a, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[27]:





<matplotlib.text.Text at 0x1087fe0b8>



In [28]:

    
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease']
cw154 = cw154.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.methylation_unweighted, cw154.methylation_unweighted)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2 = pd.merge(out, methylation_differences, how='inner')
pairs2 = pairs2.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs2.methylation_difference # dependent variable
X = pairs2.drop(['methylation_unweighted', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_Tris_protease', predict \delta methylation")
est.summary()









    



Regression results for CLL 'cw154_Tris_protease', predict \delta methylation






    Out[28]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.183


  Model:                       OLS             Adj. R-squared:        0.171


  Method:                 Least Squares        F-statistic:           15.33


  Date:                 Fri, 08 Jul 2016       Prob (F-statistic):  4.83e-09


  Time:                     16:04:59           Log-Likelihood:       530.18


  No. Observations:             210            AIC:                  -1052.


  Df Residuals:                 206            BIC:                  -1039.


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                13.2802      8.774      1.514   0.132     -4.018    30.579


  avgReadCpGs_mean      0.0208      0.018      1.179   0.240     -0.014     0.056


  Unique_CpGs_mean  -6.604e-08   1.25e-08     -5.274   0.000  -9.07e-08 -4.14e-08


  bsRate_mean         -13.8821      9.183     -1.512   0.132    -31.988     4.223




  Omnibus:        10.871    Durbin-Watson:         1.441


  Prob(Omnibus):   0.004    Jarque-Bera (JB):     11.619


  Skew:            0.576    Prob(JB):            0.00300


  Kurtosis:        2.945    Cond. No.           3.92e+09



In [29]:

    
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs2, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, cw154_Tris_protease")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[29]:





<matplotlib.text.Text at 0x108a4eba8>



In [30]:

    
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference",  data=pairs2, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154_Tris_protease")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[30]:





<matplotlib.text.Text at 0x108cbb9e8>



In [31]:

    
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference",  data=pairs2, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, cw154_Tris_protease")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[31]:





<matplotlib.text.Text at 0x108fcd908>



In [32]:

    
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease_GR']
cw154 = cw154.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.methylation_unweighted, cw154.methylation_unweighted)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2a = pd.merge(out, methylation_differences, how='inner')
pairs2a = pairs2a.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs2a.methylation_difference # dependent variable
X = pairs2a.drop(['methylation_unweighted', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_cw154_Tris_protease_GR', predict \delta methylation")
est.summary()









    



Regression results for CLL 'RRBS_cw154_Tris_protease_GR', predict \delta methylation






    Out[32]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.137


  Model:                       OLS             Adj. R-squared:        0.125


  Method:                 Least Squares        F-statistic:           10.91


  Date:                 Fri, 08 Jul 2016       Prob (F-statistic):  1.11e-06


  Time:                     16:05:02           Log-Likelihood:       560.32


  No. Observations:             210            AIC:                  -1113.


  Df Residuals:                 206            BIC:                  -1099.


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const               -23.4595      7.821     -2.999   0.003    -38.879    -8.040


  avgReadCpGs_mean      0.0184      0.013      1.463   0.145     -0.006     0.043


  Unique_CpGs_mean  -3.898e-08   8.91e-09     -4.375   0.000  -5.65e-08 -2.14e-08


  bsRate_mean          24.3577      8.177      2.979   0.003      8.236    40.480




  Omnibus:         5.666    Durbin-Watson:         2.121


  Prob(Omnibus):   0.059    Jarque-Bera (JB):      4.628


  Skew:            0.266    Prob(JB):             0.0988


  Kurtosis:        2.504    Cond. No.           5.05e+09



In [33]:

    
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs2a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, cw154_Tris_protease_GR")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[33]:





<matplotlib.text.Text at 0x10904a128>



In [34]:

    
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference",  data=pairs2a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154_Tris_protease_GR")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[34]:





<matplotlib.text.Text at 0x109479da0>



In [35]:

    
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference",  data=pairs2a, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, cw154_Tris_protease_GR")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[35]:





<matplotlib.text.Text at 0x1096d1ac8>



In [36]:

    
# cw154_CutSmart_proteinase_K
cw154 = merged[merged["protocol"] == 'cw154_CutSmart_proteinase_K']
cw154 = cw154.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.methylation_unweighted, cw154.methylation_unweighted)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2b = pd.merge(out, methylation_differences, how='inner')
pairs2b = pairs2b.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs2b.methylation_difference # dependent variable
X = pairs2b.drop(['methylation_unweighted', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta methylation")
est.summary()









    



Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta methylation






    Out[36]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.452


  Model:                       OLS             Adj. R-squared:        0.444


  Method:                 Least Squares        F-statistic:           56.59


  Date:                 Fri, 08 Jul 2016       Prob (F-statistic):  1.00e-26


  Time:                     16:05:05           Log-Likelihood:       649.84


  No. Observations:             210            AIC:                  -1292.


  Df Residuals:                 206            BIC:                  -1278.


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const               -35.4398      4.078     -8.690   0.000    -43.480   -27.400


  avgReadCpGs_mean      0.0254      0.017      1.481   0.140     -0.008     0.059


  Unique_CpGs_mean  -5.047e-08   6.48e-09     -7.789   0.000  -6.32e-08 -3.77e-08


  bsRate_mean          36.6662      4.229      8.670   0.000     28.328    45.004




  Omnibus:         1.275    Durbin-Watson:         1.338


  Prob(Omnibus):   0.529    Jarque-Bera (JB):      1.320


  Skew:            0.121    Prob(JB):              0.517


  Kurtosis:        2.696    Cond. No.           5.01e+09



In [37]:

    
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs2b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, cw154_Tris_protease_GR")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[37]:





<matplotlib.text.Text at 0x109954438>



In [38]:

    
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference",  data=pairs2b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154_Tris_protease_GR")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[38]:





<matplotlib.text.Text at 0x109ba9240>



In [39]:

    
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference",  data=pairs2b, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, cw154_Tris_protease_GR")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[39]:





<matplotlib.text.Text at 0x109e05cf8>



In [40]:

    
merged









    Out[40]:






  
    
      
      filename
      methylation_unweighted
      total_reads
      type
      bio
      protocol
      avgReadCpGs_mean
      avgReadCpGs_median
      Unique_CpGs
      bsRate
    
  
  
    
      0
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACAACC
      0.691996
      11894660.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.295301
      4.0
      178825
      0.959657
    
    
      1
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACCGCG
      0.620106
      3744659.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.285714
      4.0
      86434
      0.958634
    
    
      2
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACGTGG
      0.699736
      10461874.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.453122
      5.0
      186115
      0.958881
    
    
      4
      RRBS_normal_B_cell_A1_24_TAAGGCGA.AGGATG
      0.732036
      21928743.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.366276
      5.0
      289150
      0.958404
    
    
      5
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ATAGCG
      0.648127
      10864882.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.269581
      4.0
      169511
      0.959474
    
    
      6
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ATCGAC
      0.716552
      18977710.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.290053
      4.0
      277430
      0.958963
    
    
      7
      RRBS_normal_B_cell_A1_24_TAAGGCGA.CAAGAG
      0.670718
      15806813.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.337565
      4.0
      237047
      0.958413
    
    
      8
      RRBS_normal_B_cell_A1_24_TAAGGCGA.CATGAC
      0.668592
      18369519.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.232376
      4.0
      285468
      0.959137
    
    
      10
      RRBS_normal_B_cell_A1_24_TAAGGCGA.CGGTAG
      0.701418
      11929945.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.234528
      4.0
      195729
      0.958538
    
    
      11
      RRBS_normal_B_cell_A1_24_TAAGGCGA.CTATTG
      0.650146
      22563380.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.392137
      4.0
      294801
      0.959029
    
    
      12
      RRBS_normal_B_cell_A1_24_TAAGGCGA.CTCAGC
      0.640678
      11977642.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.293057
      4.0
      206210
      0.958260
    
    
      13
      RRBS_normal_B_cell_A1_24_TAAGGCGA.GACACG
      0.705313
      9335558.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.353557
      4.0
      173890
      0.958764
    
    
      15
      RRBS_normal_B_cell_A1_24_TAAGGCGA.GCTGCC
      0.696495
      8774558.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.273738
      4.0
      232913
      0.958182
    
    
      16
      RRBS_normal_B_cell_A1_24_TAAGGCGA.GGCATC
      0.688324
      10917289.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.271882
      4.0
      205672
      0.958583
    
    
      17
      RRBS_normal_B_cell_A1_24_TAAGGCGA.GTGAGG
      0.692209
      12029088.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.353795
      4.0
      207190
      0.958793
    
    
      18
      RRBS_normal_B_cell_A1_24_TAAGGCGA.GTTGAG
      0.645717
      11774403.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.069151
      4.0
      183150
      0.958447
    
    
      19
      RRBS_normal_B_cell_A1_24_TAAGGCGA.TAGCGG
      0.671286
      9235563.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.244287
      4.0
      172272
      0.958723
    
    
      20
      RRBS_normal_B_cell_A1_24_TAAGGCGA.TATCTC
      0.663549
      20653531.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.305180
      4.0
      286542
      0.959223
    
    
      21
      RRBS_normal_B_cell_A1_24_TAAGGCGA.TCTCTG
      0.621230
      16713780.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.300177
      4.0
      255722
      0.959102
    
    
      22
      RRBS_normal_B_cell_A1_24_TAAGGCGA.TGACAG
      0.670456
      14446829.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.371434
      4.0
      213850
      0.958745
    
    
      24
      RRBS_normal_B_cell_B1_24_CGTACTAG.ACAACC
      0.723172
      19018572.0
      normal
      normal_B
      normal_B_cell_B1_24
      5.333422
      4.0
      607869
      0.961356
    
    
      25
      RRBS_normal_B_cell_B1_24_CGTACTAG.ACCGCG
      0.620718
      4245126.0
      normal
      normal_B
      normal_B_cell_B1_24
      5.225119
      4.0
      211560
      0.961111
    
    
      27
      RRBS_normal_B_cell_B1_24_CGTACTAG.ACTCAC
      0.731753
      21361140.0
      normal
      normal_B
      normal_B_cell_B1_24
      5.375833
      4.0
      649326
      0.960867
    
    
      29
      RRBS_normal_B_cell_B1_24_CGTACTAG.ATAGCG
      0.710628
      15217237.0
      normal
      normal_B
      normal_B_cell_B1_24
      5.428571
      4.0
      507904
      0.961387
    
    
      31
      RRBS_normal_B_cell_B1_24_CGTACTAG.CAAGAG
      0.649119
      18225536.0
      normal
      normal_B
      normal_B_cell_B1_24
      5.382511
      4.0
      597716
      0.961228
    
    
      32
      RRBS_normal_B_cell_B1_24_CGTACTAG.CATGAC
      0.726315
      17508416.0
      normal
      normal_B
      normal_B_cell_B1_24
      5.373838
      4.0
      610746
      0.961153
    
    
      33
      RRBS_normal_B_cell_B1_24_CGTACTAG.CCTTCG
      0.698844
      10825670.0
      normal
      normal_B
      normal_B_cell_B1_24
      5.246197
      4.0
      390623
      0.960858
    
    
      34
      RRBS_normal_B_cell_B1_24_CGTACTAG.CGGTAG
      0.609985
      11106696.0
      normal
      normal_B
      normal_B_cell_B1_24
      5.287185
      4.0
      404019
      0.961128
    
    
      35
      RRBS_normal_B_cell_B1_24_CGTACTAG.CTATTG
      0.660035
      26046806.0
      normal
      normal_B
      normal_B_cell_B1_24
      5.376118
      4.0
      726222
      0.961322
    
    
      36
      RRBS_normal_B_cell_B1_24_CGTACTAG.CTCAGC
      0.718351
      10352359.0
      normal
      normal_B
      normal_B_cell_B1_24
      5.391431
      4.0
      405492
      0.960827
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      384
      RRBS_trito_pool_1_TAAGGCGA.GTGAGG
      0.630143
      14375456.0
      CLL
      CLL
      trito_pool_1
      5.434248
      4.0
      832135
      0.980150
    
    
      385
      RRBS_trito_pool_1_TAAGGCGA.GTTGAG
      0.651666
      17882967.0
      CLL
      CLL
      trito_pool_1
      5.460279
      4.0
      919081
      0.980045
    
    
      386
      RRBS_trito_pool_1_TAAGGCGA.TAGCGG
      0.635522
      11903058.0
      CLL
      CLL
      trito_pool_1
      5.578051
      5.0
      750879
      0.980104
    
    
      387
      RRBS_trito_pool_1_TAAGGCGA.TATCTC
      0.640908
      22347634.0
      CLL
      CLL
      trito_pool_1
      5.385500
      4.0
      1033019
      0.980429
    
    
      388
      RRBS_trito_pool_1_TAAGGCGA.TCTCTG
      0.649959
      19310635.0
      CLL
      CLL
      trito_pool_1
      5.507866
      5.0
      983183
      0.980216
    
    
      389
      RRBS_trito_pool_1_TAAGGCGA.TGACAG
      0.651817
      17903364.0
      CLL
      CLL
      trito_pool_1
      5.445355
      5.0
      911433
      0.980182
    
    
      390
      RRBS_trito_pool_1_TAAGGCGA.TGCTGC
      0.638604
      10782400.0
      CLL
      CLL
      trito_pool_1
      5.436130
      4.0
      758364
      0.979957
    
    
      391
      RRBS_trito_pool_2_CGTACTAG.ACAACC
      0.655123
      14605288.0
      CLL
      CLL
      trito_pool_2
      5.456495
      5.0
      989756
      0.975723
    
    
      392
      RRBS_trito_pool_2_CGTACTAG.ACGTGG
      0.648626
      8881971.0
      CLL
      CLL
      trito_pool_2
      5.396058
      4.0
      719008
      0.975727
    
    
      393
      RRBS_trito_pool_2_CGTACTAG.ACTCAC
      0.652962
      13745701.0
      CLL
      CLL
      trito_pool_2
      5.573111
      5.0
      900261
      0.976011
    
    
      394
      RRBS_trito_pool_2_CGTACTAG.AGGATG
      0.658358
      15488890.0
      CLL
      CLL
      trito_pool_2
      5.425277
      4.0
      1038209
      0.975797
    
    
      395
      RRBS_trito_pool_2_CGTACTAG.ATAGCG
      0.635741
      8881783.0
      CLL
      CLL
      trito_pool_2
      5.583708
      5.0
      625586
      0.976017
    
    
      396
      RRBS_trito_pool_2_CGTACTAG.ATCGAC
      0.661048
      14625731.0
      CLL
      CLL
      trito_pool_2
      5.386750
      4.0
      1018364
      0.975815
    
    
      397
      RRBS_trito_pool_2_CGTACTAG.CAAGAG
      0.649029
      12469340.0
      CLL
      CLL
      trito_pool_2
      5.488154
      4.0
      888020
      0.975747
    
    
      398
      RRBS_trito_pool_2_CGTACTAG.CATGAC
      0.659853
      13835873.0
      CLL
      CLL
      trito_pool_2
      5.389775
      4.0
      994327
      0.975796
    
    
      399
      RRBS_trito_pool_2_CGTACTAG.CCCGGG
      0.633890
      8053245.0
      CLL
      CLL
      trito_pool_2
      5.434495
      4.0
      835260
      0.975402
    
    
      400
      RRBS_trito_pool_2_CGTACTAG.CCTTCG
      0.647402
      8881050.0
      CLL
      CLL
      trito_pool_2
      5.396002
      4.0
      704234
      0.975621
    
    
      401
      RRBS_trito_pool_2_CGTACTAG.CGGTAG
      0.645339
      9945534.0
      CLL
      CLL
      trito_pool_2
      5.393190
      4.0
      769964
      0.975843
    
    
      402
      RRBS_trito_pool_2_CGTACTAG.CTATTG
      0.659609
      17521456.0
      CLL
      CLL
      trito_pool_2
      5.481741
      4.0
      1067682
      0.975847
    
    
      403
      RRBS_trito_pool_2_CGTACTAG.CTCAGC
      0.647401
      19116083.0
      CLL
      CLL
      trito_pool_2
      5.490711
      4.0
      1403284
      0.975506
    
    
      404
      RRBS_trito_pool_2_CGTACTAG.GACACG
      0.645343
      7390281.0
      CLL
      CLL
      trito_pool_2
      5.359332
      4.0
      657912
      0.975698
    
    
      405
      RRBS_trito_pool_2_CGTACTAG.GCATTC
      0.656557
      12974222.0
      CLL
      CLL
      trito_pool_2
      5.496919
      4.0
      945183
      0.975748
    
    
      406
      RRBS_trito_pool_2_CGTACTAG.GCTGCC
      0.650519
      4336065.0
      CLL
      CLL
      trito_pool_2
      5.238913
      4.0
      533549
      0.975409
    
    
      407
      RRBS_trito_pool_2_CGTACTAG.GGCATC
      0.648295
      9349697.0
      CLL
      CLL
      trito_pool_2
      5.413434
      4.0
      827303
      0.975700
    
    
      408
      RRBS_trito_pool_2_CGTACTAG.GTGAGG
      0.655986
      9733080.0
      CLL
      CLL
      trito_pool_2
      5.425350
      4.0
      812656
      0.975756
    
    
      409
      RRBS_trito_pool_2_CGTACTAG.GTTGAG
      0.659633
      12146937.0
      CLL
      CLL
      trito_pool_2
      5.515359
      5.0
      889990
      0.975847
    
    
      410
      RRBS_trito_pool_2_CGTACTAG.TAGCGG
      0.650606
      7262356.0
      CLL
      CLL
      trito_pool_2
      5.573582
      5.0
      660936
      0.975576
    
    
      411
      RRBS_trito_pool_2_CGTACTAG.TATCTC
      0.647639
      16347740.0
      CLL
      CLL
      trito_pool_2
      5.520034
      5.0
      1054358
      0.976017
    
    
      412
      RRBS_trito_pool_2_CGTACTAG.TCTCTG
      0.651297
      12749376.0
      CLL
      CLL
      trito_pool_2
      5.291110
      4.0
      924822
      0.975755
    
    
      413
      RRBS_trito_pool_2_CGTACTAG.TGACAG
      0.654169
      11829563.0
      CLL
      CLL
      trito_pool_2
      5.389180
      4.0
      871978
      0.975783
    
  

358 rows × 10 columns



In [41]:

    
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell1_22_']
pcell = pcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations  
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation_unweighted, pcell.methylation_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3 = pd.merge(out, methylation_differences, how='inner')
pairs3 = pairs3.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs3.methylation_difference # dependent variable
X = pairs3.drop(['methylation_unweighted', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal B 'NormalBCD19pCD27pcell1_22', predict \delta methylation")
est.summary()









    



Regression results for Normal B 'NormalBCD19pCD27pcell1_22', predict \delta methylation






    Out[41]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.101


  Model:                       OLS             Adj. R-squared:        0.083


  Method:                 Least Squares        F-statistic:           5.609


  Date:                 Fri, 08 Jul 2016       Prob (F-statistic):   0.00114


  Time:                     16:05:09           Log-Likelihood:       341.41


  No. Observations:             153            AIC:                  -674.8


  Df Residuals:                 149            BIC:                  -662.7


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const              -116.4361     37.123     -3.137   0.002   -189.791   -43.081


  avgReadCpGs_mean      0.0795      0.035      2.263   0.025      0.010     0.149


  Unique_CpGs_mean  -1.209e-08   2.78e-08     -0.435   0.664  -6.71e-08  4.29e-08


  bsRate_mean         116.3615     37.212      3.127   0.002     42.831   189.892




  Omnibus:         8.971    Durbin-Watson:         1.528


  Prob(Omnibus):   0.011    Jarque-Bera (JB):      9.321


  Skew:            0.604    Prob(JB):            0.00946


  Kurtosis:        3.024    Cond. No.           9.28e+09



In [42]:

    
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs3, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, CD27pcell1_22")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[42]:





<matplotlib.text.Text at 0x10a13fb70>



In [43]:

    
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference",  data=pairs3, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, CD27pcell1_22")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[43]:





<matplotlib.text.Text at 0x10a4461d0>



In [44]:

    
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference",  data=pairs3, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, CD27pcell1_22")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[44]:





<matplotlib.text.Text at 0x10a671b38>



In [45]:

    
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell23_44']
pcell = pcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation_unweighted, pcell.methylation_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3a = pd.merge(out, methylation_differences, how='inner')
pairs3a = pairs3a.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs3a.methylation_difference # dependent variable
X = pairs3a.drop(['methylation_unweighted', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal B 'NormalBCD19pCD27pcell22_34', predict \delta methylation")
est.summary()









    



Regression results for Normal B 'NormalBCD19pCD27pcell22_34', predict \delta methylation






    Out[45]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.106


  Model:                       OLS             Adj. R-squared:        0.094


  Method:                 Least Squares        F-statistic:           8.932


  Date:                 Fri, 08 Jul 2016       Prob (F-statistic):  1.28e-05


  Time:                     16:05:12           Log-Likelihood:       550.03


  No. Observations:             231            AIC:                  -1092.


  Df Residuals:                 227            BIC:                  -1078.


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const               -49.4994     37.450     -1.322   0.188   -123.293    24.294


  avgReadCpGs_mean      0.0575      0.019      3.007   0.003      0.020     0.095


  Unique_CpGs_mean  -2.296e-08   2.27e-08     -1.009   0.314  -6.78e-08  2.19e-08


  bsRate_mean          49.3532     37.520      1.315   0.190    -24.578   123.285




  Omnibus:        10.109    Durbin-Watson:         1.829


  Prob(Omnibus):   0.006    Jarque-Bera (JB):     10.757


  Skew:            0.524    Prob(JB):            0.00461


  Kurtosis:        2.863    Cond. No.           1.28e+10



In [46]:

    
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs3a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, CD27pcell23_44")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[46]:





<matplotlib.text.Text at 0x10a8d09b0>



In [47]:

    
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference",  data=pairs3a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  CD27pcell23_44")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[47]:





<matplotlib.text.Text at 0x10ab3c438>



In [48]:

    
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference",  data=pairs3a, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, CD27pcell23_44")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[48]:





<matplotlib.text.Text at 0x10ae408d0>



In [49]:

    
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell45_66']
pcell = pcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation_unweighted, pcell.methylation_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3b = pd.merge(out, methylation_differences, how='inner')
pairs3b = pairs3b.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs3b.methylation_difference # dependent variable
X = pairs3b.drop(['methylation_unweighted', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal B 'NormalBCD19pCD27pcell45_66', predict \delta methylation")
est.summary()









    



Regression results for Normal B 'NormalBCD19pCD27pcell45_66', predict \delta methylation






    Out[49]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.385


  Model:                       OLS             Adj. R-squared:        0.327


  Method:                 Least Squares        F-statistic:           6.668


  Date:                 Fri, 08 Jul 2016       Prob (F-statistic):   0.00127


  Time:                     16:05:14           Log-Likelihood:       96.760


  No. Observations:              36            AIC:                  -185.5


  Df Residuals:                  32            BIC:                  -179.2


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const               287.2760    124.543      2.307   0.028     33.590   540.962


  avgReadCpGs_mean      0.0375      0.043      0.881   0.385     -0.049     0.124


  Unique_CpGs_mean   4.071e-09   1.76e-08      0.231   0.819  -3.19e-08     4e-08


  bsRate_mean        -288.1651    124.678     -2.311   0.027   -542.126   -34.204




  Omnibus:         1.435    Durbin-Watson:         2.482


  Prob(Omnibus):   0.488    Jarque-Bera (JB):      1.359


  Skew:           -0.429    Prob(JB):              0.507


  Kurtosis:        2.590    Cond. No.           3.46e+10



In [50]:

    
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs3b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, NormalBCD19pCD27pcell45_66")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[50]:





<matplotlib.text.Text at 0x10b0a6ef0>



In [51]:

    
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference",  data=pairs3b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, NormalBCD19pCD27pcell45_66")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[51]:





<matplotlib.text.Text at 0x10b2d9828>



In [52]:

    
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference",  data=pairs3b, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, NormalBCD19pCD27pcell45_66")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[52]:





<matplotlib.text.Text at 0x10b4df4e0>



In [53]:

    
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell67_88']
pcell = pcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation_unweighted, pcell.methylation_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3c = pd.merge(out, methylation_differences, how='inner')
pairs3c = pairs3c.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs3c.methylation_difference # dependent variable
X = pairs3c.drop(['methylation_unweighted', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal B 'NormalBCD19pCD27pcell67_88', predict \delta methylation")
est.summary()









    



Regression results for Normal B 'NormalBCD19pCD27pcell67_88', predict \delta methylation






    Out[53]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.057


  Model:                       OLS             Adj. R-squared:        0.043


  Method:                 Least Squares        F-statistic:           4.142


  Date:                 Fri, 08 Jul 2016       Prob (F-statistic):   0.00707


  Time:                     16:05:17           Log-Likelihood:       566.88


  No. Observations:             210            AIC:                  -1126.


  Df Residuals:                 206            BIC:                  -1112.


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                81.7565     24.514      3.335   0.001     33.426   130.087


  avgReadCpGs_mean     -0.0398      0.026     -1.542   0.125     -0.091     0.011


  Unique_CpGs_mean   2.623e-08   1.44e-08      1.822   0.070  -2.15e-09  5.46e-08


  bsRate_mean         -81.7656     24.529     -3.333   0.001   -130.126   -33.405




  Omnibus:        26.255    Durbin-Watson:         0.981


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     32.164


  Skew:            0.937    Prob(JB):           1.04e-07


  Kurtosis:        3.404    Cond. No.           1.06e+10



In [54]:

    
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs3c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, NormalBCD19pCD27pcell67_88")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[54]:





<matplotlib.text.Text at 0x10b80ba90>



In [55]:

    
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference",  data=pairs3c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, NormalBCD19pCD27pcell67_88")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[55]:





<matplotlib.text.Text at 0x10bb28da0>



In [56]:

    
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference",  data=pairs3c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, NormalBCD19pCD27pcell67_88")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[56]:





<matplotlib.text.Text at 0x10be39550>



In [57]:

    
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell1_22_']
mcell = mcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation_unweighted, mcell.methylation_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4 = pd.merge(out, methylation_differences, how='inner')
pairs4 = pairs4.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs4.methylation_difference # dependent variable
X = pairs4.drop(['methylation_unweighted', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'NormalBCD19pCD27mcell1_22', predict \delta methylation")
est.summary()









    



Regression results for 'NormalBCD19pCD27mcell1_22', predict \delta methylation






    Out[57]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.459


  Model:                       OLS             Adj. R-squared:        0.448


  Method:                 Least Squares        F-statistic:           42.15


  Date:                 Fri, 08 Jul 2016       Prob (F-statistic):  8.77e-20


  Time:                     16:05:20           Log-Likelihood:       385.37


  No. Observations:             153            AIC:                  -762.7


  Df Residuals:                 149            BIC:                  -750.6


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const              -148.0990     46.964     -3.153   0.002   -240.901   -55.297


  avgReadCpGs_mean      0.2332      0.028      8.201   0.000      0.177     0.289


  Unique_CpGs_mean   1.394e-07   2.65e-08      5.250   0.000   8.69e-08  1.92e-07


  bsRate_mean         147.2034     47.141      3.123   0.002     54.053   240.354




  Omnibus:        14.563    Durbin-Watson:         1.334


  Prob(Omnibus):   0.001    Jarque-Bera (JB):      7.216


  Skew:            0.330    Prob(JB):             0.0271


  Kurtosis:        2.165    Cond. No.           1.65e+10



In [58]:

    
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs4, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, NormalBCD19pCD27mcell1_22 ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[58]:





<matplotlib.text.Text at 0x10c0cc860>



In [59]:

    
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference",  data=pairs4, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, NormalBCD19pCD27mcell1_22")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[59]:





<matplotlib.text.Text at 0x10c3195f8>



In [60]:

    
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference",  data=pairs4, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, NormalBCD19pCD27mcell1_22")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[60]:





<matplotlib.text.Text at 0x10c5e7908>



In [61]:

    
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell23_44']
mcell = mcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation_unweighted, mcell.methylation_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4a = pd.merge(out, methylation_differences, how='inner')
pairs4a = pairs4a.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs4a.methylation_difference # dependent variable
X = pairs4a.drop(['methylation_unweighted', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'NormalBCD19pCD27mcell23_44', predict \delta methylation")
est.summary()









    



Regression results for 'NormalBCD19pCD27mcell23_44', predict \delta methylation






    Out[61]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.070


  Model:                       OLS             Adj. R-squared:        0.053


  Method:                 Least Squares        F-statistic:           4.192


  Date:                 Fri, 08 Jul 2016       Prob (F-statistic):   0.00685


  Time:                     16:05:23           Log-Likelihood:       479.12


  No. Observations:             171            AIC:                  -950.2


  Df Residuals:                 167            BIC:                  -937.7


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const               -69.1096     43.629     -1.584   0.115   -155.244    17.025


  avgReadCpGs_mean     -0.1001      0.031     -3.206   0.002     -0.162    -0.038


  Unique_CpGs_mean  -4.984e-08   2.06e-08     -2.421   0.017  -9.05e-08 -9.19e-09


  bsRate_mean          69.8564     43.739      1.597   0.112    -16.497   156.210




  Omnibus:         9.337    Durbin-Watson:         1.399


  Prob(Omnibus):   0.009    Jarque-Bera (JB):      9.857


  Skew:            0.588    Prob(JB):            0.00724


  Kurtosis:        2.948    Cond. No.           1.91e+10



In [62]:

    
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs4a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, NormalBCD19pCD27mcell23_44 ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[62]:





<matplotlib.text.Text at 0x10c846588>



In [63]:

    
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference",  data=pairs4a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, NormalBCD19pCD27mcell23_44")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[63]:





<matplotlib.text.Text at 0x10ca9dba8>



In [64]:

    
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference",  data=pairs4a, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, NormalBCD19pCD27mcell23_44")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[64]:





<matplotlib.text.Text at 0x10cdc6860>



In [65]:

    
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell45_66']
mcell = mcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation_unweighted, mcell.methylation_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4b = pd.merge(out, methylation_differences, how='inner')
pairs4b = pairs4b.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs4b.methylation_difference # dependent variable
X = pairs4b.drop(['methylation_unweighted', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'NormalBCD19pCD27mcell23_44', predict \delta methylation")
est.summary()









    



Regression results for 'NormalBCD19pCD27mcell23_44', predict \delta methylation






    Out[65]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.339


  Model:                       OLS             Adj. R-squared:        0.324


  Method:                 Least Squares        F-statistic:           22.53


  Date:                 Fri, 08 Jul 2016       Prob (F-statistic):  7.69e-12


  Time:                     16:05:26           Log-Likelihood:       352.27


  No. Observations:             136            AIC:                  -696.5


  Df Residuals:                 132            BIC:                  -684.9


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                61.2886     47.409      1.293   0.198    -32.491   155.069


  avgReadCpGs_mean      0.2237      0.028      7.903   0.000      0.168     0.280


  Unique_CpGs_mean   1.393e-07   3.03e-08      4.595   0.000   7.93e-08  1.99e-07


  bsRate_mean         -62.6685     47.529     -1.319   0.190   -156.685    31.348




  Omnibus:         3.998    Durbin-Watson:         1.192


  Prob(Omnibus):   0.135    Jarque-Bera (JB):      3.913


  Skew:            0.414    Prob(JB):              0.141


  Kurtosis:        2.925    Cond. No.           1.55e+10



In [66]:

    
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs4b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, NormalBCD19pCD27mcell45_66 ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[66]:





<matplotlib.text.Text at 0x10d03a320>



In [67]:

    
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference",  data=pairs4b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, NormalBCD19pCD27mcell45_66")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[67]:





<matplotlib.text.Text at 0x10d34e2b0>



In [68]:

    
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference",  data=pairs4b, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, NormalBCD19pCD27mcell45_66")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[68]:





<matplotlib.text.Text at 0x10d681c18>



In [69]:

    
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell67_88']
mcell = mcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation_unweighted, mcell.methylation_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4c = pd.merge(out, methylation_differences, how='inner')
pairs4c = pairs4c.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs4c.methylation_difference # dependent variable
X = pairs4c.drop(['methylation_unweighted', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'NormalBCD19pCD27mcell23_44', predict \delta methylation")
est.summary()









    



Regression results for 'NormalBCD19pCD27mcell23_44', predict \delta methylation






    Out[69]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.150


  Model:                       OLS             Adj. R-squared:        0.138


  Method:                 Least Squares        F-statistic:           12.11


  Date:                 Fri, 08 Jul 2016       Prob (F-statistic):  2.47e-07


  Time:                     16:05:29           Log-Likelihood:       518.92


  No. Observations:             210            AIC:                  -1030.


  Df Residuals:                 206            BIC:                  -1016.


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                -5.4777     48.091     -0.114   0.909   -100.292    89.336


  avgReadCpGs_mean      0.1170      0.022      5.237   0.000      0.073     0.161


  Unique_CpGs_mean   4.107e-08   2.13e-08      1.929   0.055  -9.17e-10  8.31e-08


  bsRate_mean           4.8612     48.220      0.101   0.920    -90.207    99.929




  Omnibus:         7.984    Durbin-Watson:         1.882


  Prob(Omnibus):   0.018    Jarque-Bera (JB):      8.027


  Skew:            0.477    Prob(JB):             0.0181


  Kurtosis:        3.085    Cond. No.           2.04e+10



In [70]:

    
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs4c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, NormalBCD19pCD27mcell67_88 ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[70]:





<matplotlib.text.Text at 0x10d9adac8>



In [71]:

    
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference",  data=pairs4c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, NormalBCD19pCD27mcell67_88")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[71]:





<matplotlib.text.Text at 0x10dbdc470>



In [72]:

    
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference",  data=pairs4c, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, NormalBCD19pCD27mcell67_88")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[72]:





<matplotlib.text.Text at 0x10df66748>



In [73]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_A1_24']
normb = normb.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation_unweighted, normb.methylation_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5 = pd.merge(out, methylation_differences, how='inner')
pairs5 = pairs5.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs5.methylation_difference # dependent variable
X = pairs5.drop(['methylation_unweighted', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'normal_B_cell_A1_24', predict \delta methylation")
est.summary()









    



Regression results for 'normal_B_cell_A1_24', predict \delta methylation






    Out[73]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.009


  Model:                       OLS             Adj. R-squared:       -0.007


  Method:                 Least Squares        F-statistic:          0.5565


  Date:                 Fri, 08 Jul 2016       Prob (F-statistic):    0.644 


  Time:                     16:05:32           Log-Likelihood:       434.74


  No. Observations:             190            AIC:                  -861.5


  Df Residuals:                 186            BIC:                  -848.5


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 8.0254      6.609      1.214   0.226     -5.014    21.065


  avgReadCpGs_mean      0.0179      0.035      0.510   0.610     -0.051     0.087


  Unique_CpGs_mean  -1.006e-08   5.13e-08     -0.196   0.845  -1.11e-07  9.12e-08


  bsRate_mean          -8.4297      6.913     -1.219   0.224    -22.067     5.208




  Omnibus:        12.503    Durbin-Watson:         1.687


  Prob(Omnibus):   0.002    Jarque-Bera (JB):     13.768


  Skew:            0.657    Prob(JB):            0.00102


  Kurtosis:        2.892    Cond. No.           1.17e+09



In [74]:

    
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs5, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, normal_B_cell_A1_24")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[74]:





<matplotlib.text.Text at 0x10e76ccc0>



In [75]:

    
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference",  data=pairs5, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, normal_B_cell_A1_24")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[75]:





<matplotlib.text.Text at 0x10e9bbb00>



In [76]:

    
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference",  data=pairs5, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, normal_B_cell_A1_24")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[76]:





<matplotlib.text.Text at 0x10ed72278>



In [77]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_B1_24']
normb = normb.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation_unweighted, normb.methylation_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5a = pd.merge(out, methylation_differences, how='inner')
pairs5a = pairs5a.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs5a.methylation_difference # dependent variable
X = pairs5a.drop(['methylation_unweighted', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'normal_B_cell_B1_24', predict \delta methylation")
est.summary()









    



Regression results for 'normal_B_cell_B1_24', predict \delta methylation






    Out[77]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.027


  Model:                       OLS             Adj. R-squared:        0.011


  Method:                 Least Squares        F-statistic:           1.731


  Date:                 Fri, 08 Jul 2016       Prob (F-statistic):    0.162 


  Time:                     16:05:35           Log-Likelihood:       380.60


  No. Observations:             190            AIC:                  -753.2


  Df Residuals:                 186            BIC:                  -740.2


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const               -26.8879     20.939     -1.284   0.201    -68.196    14.420


  avgReadCpGs_mean     -0.0882      0.039     -2.271   0.024     -0.165    -0.012


  Unique_CpGs_mean  -1.628e-08   2.19e-08     -0.743   0.458  -5.95e-08   2.7e-08


  bsRate_mean          28.5286     21.926      1.301   0.195    -14.726    71.783




  Omnibus:        14.614    Durbin-Watson:         1.626


  Prob(Omnibus):   0.001    Jarque-Bera (JB):     10.052


  Skew:            0.437    Prob(JB):            0.00656


  Kurtosis:        2.288    Cond. No.           5.83e+09



In [78]:

    
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs5a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, normal_B_cell_B1_24")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[78]:





<matplotlib.text.Text at 0x10f086550>



In [79]:

    
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference",  data=pairs5a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, normal_B_cell_B1_24")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[79]:





<matplotlib.text.Text at 0x10f2caac8>



In [80]:

    
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference",  data=pairs5a, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, normal_B_cell_B1_24")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[80]:





<matplotlib.text.Text at 0x10f5d44a8>



In [81]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_C1_24']
normb = normb.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation_unweighted, normb.methylation_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5b = pd.merge(out, methylation_differences, how='inner')
pairs5b = pairs5b.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs5b.methylation_difference # dependent variable
X = pairs5b.drop(['methylation_unweighted', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'normal_B_cell_C1_24', predict \delta methylation")
est.summary()









    



Regression results for 'normal_B_cell_C1_24', predict \delta methylation






    Out[81]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.016


  Model:                       OLS             Adj. R-squared:        0.003


  Method:                 Least Squares        F-statistic:           1.244


  Date:                 Fri, 08 Jul 2016       Prob (F-statistic):    0.295 


  Time:                     16:05:38           Log-Likelihood:       470.32


  No. Observations:             231            AIC:                  -932.6


  Df Residuals:                 227            BIC:                  -918.9


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const               -10.8882     17.324     -0.629   0.530    -45.024    23.248


  avgReadCpGs_mean     -0.0617      0.047     -1.304   0.194     -0.155     0.032


  Unique_CpGs_mean  -1.209e-08   1.04e-08     -1.162   0.247  -3.26e-08  8.42e-09


  bsRate_mean          11.7216     18.158      0.646   0.519    -24.057    47.501




  Omnibus:        19.896    Durbin-Watson:         1.005


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     16.257


  Skew:            0.558    Prob(JB):           0.000295


  Kurtosis:        2.335    Cond. No.           9.25e+09



In [82]:

    
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs5b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, normal_B_cell_C1_24")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[82]:





<matplotlib.text.Text at 0x10f831518>



In [83]:

    
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference",  data=pairs5b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, normal_B_cell_C1_24")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[83]:





<matplotlib.text.Text at 0x10fa4f9b0>



In [84]:

    
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference",  data=pairs5b, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, normal_B_cell_C1_24")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[84]:





<matplotlib.text.Text at 0x10fd2bf60>



In [85]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_D1_24']
normb = normb.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation_unweighted, normb.methylation_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5c = pd.merge(out, methylation_differences, how='inner')
pairs5c = pairs5c.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs5c.methylation_difference # dependent variable
X = pairs5c.drop(['methylation_unweighted', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'normal_B_cell_D1_24', predict \delta methylation")
est.summary()









    



Regression results for 'normal_B_cell_D1_24', predict \delta methylation






    Out[85]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.241


  Model:                       OLS             Adj. R-squared:        0.170


  Method:                 Least Squares        F-statistic:           3.383


  Date:                 Fri, 08 Jul 2016       Prob (F-statistic):   0.0300 


  Time:                     16:05:41           Log-Likelihood:       85.716


  No. Observations:              36            AIC:                  -163.4


  Df Residuals:                  32            BIC:                  -157.1


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                17.9503     27.055      0.663   0.512    -37.158    73.059


  avgReadCpGs_mean     -0.1749      0.058     -3.004   0.005     -0.294    -0.056


  Unique_CpGs_mean   3.622e-08   2.71e-08      1.335   0.191   -1.9e-08  9.15e-08


  bsRate_mean         -17.6555     28.176     -0.627   0.535    -75.048    39.736




  Omnibus:         2.534    Durbin-Watson:         2.132


  Prob(Omnibus):   0.282    Jarque-Bera (JB):      2.093


  Skew:           -0.585    Prob(JB):              0.351


  Kurtosis:        2.843    Cond. No.           7.43e+09



In [86]:

    
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs5c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, normal_B_cell_D1_24 ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[86]:





<matplotlib.text.Text at 0x1100546d8>



In [87]:

    
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference",  data=pairs5c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, normal_B_cell_D1_24")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[87]:





<matplotlib.text.Text at 0x110326da0>



In [88]:

    
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference",  data=pairs5c, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, normal_B_cell_D1_24")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[88]:





<matplotlib.text.Text at 0x11060c278>



In [89]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_G1_22']
normb = normb.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation_unweighted, normb.methylation_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5d = pd.merge(out, methylation_differences, how='inner')
pairs5d = pairs5d.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs5d.methylation_difference # dependent variable
X = pairs5d.drop(['methylation_unweighted', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'normal_B_cell_G1_22', predict \delta methylation")
est.summary()









    



Regression results for 'normal_B_cell_G1_22', predict \delta methylation






    Out[89]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.059


  Model:                       OLS             Adj. R-squared:        0.042


  Method:                 Least Squares        F-statistic:           3.492


  Date:                 Fri, 08 Jul 2016       Prob (F-statistic):   0.0170 


  Time:                     16:05:44           Log-Likelihood:       342.48


  No. Observations:             171            AIC:                  -677.0


  Df Residuals:                 167            BIC:                  -664.4


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 6.1267     15.422      0.397   0.692    -24.320    36.573


  avgReadCpGs_mean      0.1638      0.057      2.892   0.004      0.052     0.276


  Unique_CpGs_mean   1.148e-08   2.38e-08      0.483   0.630  -3.54e-08  5.84e-08


  bsRate_mean          -7.2579     16.238     -0.447   0.655    -39.317    24.801




  Omnibus:         9.148    Durbin-Watson:         1.747


  Prob(Omnibus):   0.010    Jarque-Bera (JB):      9.707


  Skew:            0.567    Prob(JB):            0.00780


  Kurtosis:        2.727    Cond. No.           5.53e+09



In [90]:

    
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs5d, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, normal_B_cell_G1_22 ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[90]:





<matplotlib.text.Text at 0x110906860>



In [91]:

    
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference",  data=pairs5d, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, normal_B_cell_G1_22")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[91]:





<matplotlib.text.Text at 0x110b69518>



In [92]:

    
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference",  data=pairs5d, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, normal_B_cell_G1_22")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[92]:





<matplotlib.text.Text at 0x110f18710>



In [93]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_H1_22']
normb = normb.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation_unweighted, normb.methylation_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5e = pd.merge(out, methylation_differences, how='inner')
pairs5e = pairs5e.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs5e.methylation_difference # dependent variable
X = pairs5e.drop(['methylation_unweighted', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'normal_B_cell_H1_22', predict \delta methylation")
est.summary()









    



Regression results for 'normal_B_cell_H1_22', predict \delta methylation






    Out[93]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.071


  Model:                       OLS             Adj. R-squared:        0.047


  Method:                 Least Squares        F-statistic:           2.955


  Date:                 Fri, 08 Jul 2016       Prob (F-statistic):   0.0355 


  Time:                     16:05:46           Log-Likelihood:       246.39


  No. Observations:             120            AIC:                  -484.8


  Df Residuals:                 116            BIC:                  -473.6


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                26.9121     24.688      1.090   0.278    -21.986    75.811


  avgReadCpGs_mean      0.1334      0.057      2.350   0.020      0.021     0.246


  Unique_CpGs_mean  -3.643e-09   3.36e-08     -0.108   0.914  -7.02e-08  6.29e-08


  bsRate_mean         -28.6968     25.769     -1.114   0.268    -79.736    22.342




  Omnibus:         6.663    Durbin-Watson:         1.619


  Prob(Omnibus):   0.036    Jarque-Bera (JB):      6.817


  Skew:            0.582    Prob(JB):             0.0331


  Kurtosis:        2.906    Cond. No.           6.56e+09



In [94]:

    
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs5e, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, normal_B_cell_H1_22")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[94]:





<matplotlib.text.Text at 0x111186e10>



In [95]:

    
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference",  data=pairs5e, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, normal_B_cell_H1_22")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[95]:





<matplotlib.text.Text at 0x1113cbb38>



In [96]:

    
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference",  data=pairs5e, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, normal_B_cell_H1_22")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[96]:





<matplotlib.text.Text at 0x111617978>



In [97]:

    
pairs['type'] = str('CLL')
pairs1a['type'] = str('CLL')
pairs2['type'] = str('CLL')
pairs2a['type'] = str('CLL')
pairs2b['type'] = str('CLL')
pairs3['type'] = str('CLL')
pairs3a['type'] = str('normal')
pairs3b['type'] = str('normal')
pairs3c['type'] = str('normal')
pairs4['type'] = str('normal')
pairs4a['type'] = str('normal')
pairs4b['type'] = str('normal')
pairs4c['type'] = str('normal')
pairs5['type'] = str('normal')
pairs5a['type'] = str('normal')
pairs5b['type'] = str('normal')
pairs5c['type'] = str('normal')
pairs5d['type'] = str('normal')
pairs5e['type'] = str('normal')

frames22 = [pairs, pairs2, pairs3, pairs4]
total_pairs = pd.concat(frames22)
y = total_pairs.methylation_difference # dependent variable
X = total_pairs.drop(['methylation_unweighted', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)

categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)
    
X = X.drop(['type_normal'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation")
est.summary()









    



Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation






    Out[97]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.307


  Model:                       OLS             Adj. R-squared:        0.303


  Method:                 Least Squares        F-statistic:           79.82


  Date:                 Fri, 08 Jul 2016       Prob (F-statistic):  4.47e-56


  Time:                     16:05:49           Log-Likelihood:       1805.8


  No. Observations:             726            AIC:                  -3602.


  Df Residuals:                 721            BIC:                  -3579.


  Df Model:                       4                                        


  Covariance Type:          nonrobust                                      




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                -0.6997      0.105     -6.643   0.000     -0.907    -0.493


  avgReadCpGs_mean      0.0514      0.009      5.531   0.000      0.033     0.070


  Unique_CpGs_mean  -5.159e-08   3.29e-09    -15.683   0.000  -5.81e-08 -4.51e-08


  bsRate_mean           0.4734      0.069      6.822   0.000      0.337     0.610


  type_CLL              0.0102      0.002      4.099   0.000      0.005     0.015




  Omnibus:        104.826    Durbin-Watson:         1.463


  Prob(Omnibus):   0.000     Jarque-Bera (JB):    181.497


  Skew:            0.897     Prob(JB):           3.88e-40


  Kurtosis:        4.667     Cond. No.           9.50e+07



In [98]:

    
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, both CLL and normal ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[98]:





<matplotlib.text.Text at 0x11188dd30>



In [99]:

    
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, both CLL and normal ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[99]:





<matplotlib.text.Text at 0x111a92550>



In [100]:

    
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, both CLL and normal ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[100]:





<matplotlib.text.Text at 0x111d26400>



In [101]:

    
sns.lmplot(x="bsRate_mean", y="methylation_difference",  data=total_pairs, hue='type')









    Out[101]:





<seaborn.axisgrid.FacetGrid at 0x112009780>



In [102]:

    
sns.lmplot(x="avgReadCpGs_mean", y="methylation_difference",  data=total_pairs, hue='type')









    Out[102]:





<seaborn.axisgrid.FacetGrid at 0x11204d390>



In [103]:

    
sns.lmplot(x="Unique_CpGs_mean", y="methylation_difference",  data=total_pairs, hue='type')









    Out[103]:





<seaborn.axisgrid.FacetGrid at 0x111a63c88>



In [104]:

    
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting methylation_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting






    



Random Forest model score is 0.913346539006



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:

	Unnamed: 0	filename	PDR_unweighted	methylation_unweighted	total_reads
0	0	RRBS_trito_pool_1_TAAGGCGA.ACAACC.dan.annoRRBS...	0.357578	0.648881	21089805.0
1	1	RRBS_trito_pool_1_TAAGGCGA.ACGTGG.dan.annoRRBS...	0.358350	0.642320	12424887.0
2	2	RRBS_trito_pool_1_TAAGGCGA.ACTCAC.dan.annoRRBS...	0.353506	0.649909	18443935.0
3	3	RRBS_trito_pool_1_TAAGGCGA.ATAGCG.dan.annoRRBS...	0.357433	0.635992	12731278.0
4	4	RRBS_trito_pool_1_TAAGGCGA.ATCGAC.dan.annoRRBS...	0.359219	0.647797	20736297.0

	filename	PDR_unweighted	methylation_unweighted	total_reads	type	bio	protocol
0	RRBS_normal_B_cell_A1_24_TAAGGCGA.ACAACC	0.254835	0.691996	11894660.0	normal	normal_B	normal_B_cell_A1_24
1	RRBS_normal_B_cell_A1_24_TAAGGCGA.ACCGCG	0.390562	0.620106	3744659.0	normal	normal_B	normal_B_cell_A1_24
2	RRBS_normal_B_cell_A1_24_TAAGGCGA.ACGTGG	0.266418	0.699736	10461874.0	normal	normal_B	normal_B_cell_A1_24
3	RRBS_normal_B_cell_A1_24_TAAGGCGA.ACTCAC	0.265385	0.763173	14051.0	normal	normal_B	normal_B_cell_A1_24
4	RRBS_normal_B_cell_A1_24_TAAGGCGA.AGGATG	0.240201	0.732036	21928743.0	normal	normal_B	normal_B_cell_A1_24

	filename	totCpG	bsRate
0	RRBS_trito_pool_1_TAAGGCGA.ACAACC	995714	0.980280
1	RRBS_trito_pool_1_TAAGGCGA.ACGTGG	705787	0.980081
2	RRBS_trito_pool_1_TAAGGCGA.ACTCAC	865744	0.980305
3	RRBS_trito_pool_1_TAAGGCGA.AGGATG	955160	0.980392
4	RRBS_trito_pool_1_TAAGGCGA.ATAGCG	634455	0.980256

Dep. Variable:	methylation_unweighted	R-squared:	0.457
Model:	OLS	Adj. R-squared:	0.451
Method:	Least Squares	F-statistic:	74.21
Date:	Fri, 08 Jul 2016	Prob (F-statistic):	1.36e-45
Time:	16:04:53	Log-Likelihood:	746.26
No. Observations:	358	AIC:	-1483.
Df Residuals:	353	BIC:	-1463.
Df Model:	4
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
const	1.4335	0.081	17.720	0.000	1.274 1.593
avgReadCpGs_mean	-0.0471	0.012	-3.851	0.000	-0.071 -0.023
Unique_CpGs	5.801e-08	6.7e-09	8.653	0.000	4.48e-08 7.12e-08
bsRate	-1.3260	0.102	-12.938	0.000	-1.528 -1.124
type_CLL	0.7097	0.040	17.715	0.000	0.631 0.789
type_normal	0.7238	0.041	17.685	0.000	0.643 0.804

Omnibus:	4.079	Durbin-Watson:	1.171
Prob(Omnibus):	0.130	Jarque-Bera (JB):	3.851
Skew:	-0.247	Prob(JB):	0.146
Kurtosis:	3.116	Cond. No.	6.18e+20

Dep. Variable:	methylation_difference	R-squared:	0.003
Model:	OLS	Adj. R-squared:	-0.011
Method:	Least Squares	F-statistic:	0.2122
Date:	Fri, 08 Jul 2016	Prob (F-statistic):	0.888
Time:	16:04:53	Log-Likelihood:	800.81
No. Observations:	210	AIC:	-1594.
Df Residuals:	206	BIC:	-1580.
Df Model:	3
Covariance Type:	nonrobust

Omnibus:	12.077	Durbin-Watson:	2.275
Prob(Omnibus):	0.002	Jarque-Bera (JB):	11.720
Skew:	0.527	Prob(JB):	0.00285
Kurtosis:	2.521	Cond. No.	1.72e+10

Omnibus:	27.220	Durbin-Watson:	1.512
Prob(Omnibus):	0.000	Jarque-Bera (JB):	32.640
Skew:	0.854	Prob(JB):	8.17e-08
Kurtosis:	3.425	Cond. No.	1.24e+10

Omnibus:	10.871	Durbin-Watson:	1.441
Prob(Omnibus):	0.004	Jarque-Bera (JB):	11.619
Skew:	0.576	Prob(JB):	0.00300
Kurtosis:	2.945	Cond. No.	3.92e+09

Omnibus:	5.666	Durbin-Watson:	2.121
Prob(Omnibus):	0.059	Jarque-Bera (JB):	4.628
Skew:	0.266	Prob(JB):	0.0988
Kurtosis:	2.504	Cond. No.	5.05e+09

Omnibus:	1.275	Durbin-Watson:	1.338
Prob(Omnibus):	0.529	Jarque-Bera (JB):	1.320
Skew:	0.121	Prob(JB):	0.517
Kurtosis:	2.696	Cond. No.	5.01e+09

Omnibus:	8.971	Durbin-Watson:	1.528
Prob(Omnibus):	0.011	Jarque-Bera (JB):	9.321
Skew:	0.604	Prob(JB):	0.00946
Kurtosis:	3.024	Cond. No.	9.28e+09

Omnibus:	10.109	Durbin-Watson:	1.829
Prob(Omnibus):	0.006	Jarque-Bera (JB):	10.757
Skew:	0.524	Prob(JB):	0.00461
Kurtosis:	2.863	Cond. No.	1.28e+10

Omnibus:	1.435	Durbin-Watson:	2.482
Prob(Omnibus):	0.488	Jarque-Bera (JB):	1.359
Skew:	-0.429	Prob(JB):	0.507
Kurtosis:	2.590	Cond. No.	3.46e+10

Omnibus:	26.255	Durbin-Watson:	0.981
Prob(Omnibus):	0.000	Jarque-Bera (JB):	32.164
Skew:	0.937	Prob(JB):	1.04e-07
Kurtosis:	3.404	Cond. No.	1.06e+10

Omnibus:	14.563	Durbin-Watson:	1.334
Prob(Omnibus):	0.001	Jarque-Bera (JB):	7.216
Skew:	0.330	Prob(JB):	0.0271
Kurtosis:	2.165	Cond. No.	1.65e+10

Omnibus:	9.337	Durbin-Watson:	1.399
Prob(Omnibus):	0.009	Jarque-Bera (JB):	9.857
Skew:	0.588	Prob(JB):	0.00724
Kurtosis:	2.948	Cond. No.	1.91e+10

Omnibus:	3.998	Durbin-Watson:	1.192
Prob(Omnibus):	0.135	Jarque-Bera (JB):	3.913
Skew:	0.414	Prob(JB):	0.141
Kurtosis:	2.925	Cond. No.	1.55e+10

Omnibus:	7.984	Durbin-Watson:	1.882
Prob(Omnibus):	0.018	Jarque-Bera (JB):	8.027
Skew:	0.477	Prob(JB):	0.0181
Kurtosis:	3.085	Cond. No.	2.04e+10

Omnibus:	12.503	Durbin-Watson:	1.687
Prob(Omnibus):	0.002	Jarque-Bera (JB):	13.768
Skew:	0.657	Prob(JB):	0.00102
Kurtosis:	2.892	Cond. No.	1.17e+09

Omnibus:	14.614	Durbin-Watson:	1.626
Prob(Omnibus):	0.001	Jarque-Bera (JB):	10.052
Skew:	0.437	Prob(JB):	0.00656
Kurtosis:	2.288	Cond. No.	5.83e+09

Omnibus:	19.896	Durbin-Watson:	1.005
Prob(Omnibus):	0.000	Jarque-Bera (JB):	16.257
Skew:	0.558	Prob(JB):	0.000295
Kurtosis:	2.335	Cond. No.	9.25e+09

Omnibus:	2.534	Durbin-Watson:	2.132
Prob(Omnibus):	0.282	Jarque-Bera (JB):	2.093
Skew:	-0.585	Prob(JB):	0.351
Kurtosis:	2.843	Cond. No.	7.43e+09

Omnibus:	9.148	Durbin-Watson:	1.747
Prob(Omnibus):	0.010	Jarque-Bera (JB):	9.707
Skew:	0.567	Prob(JB):	0.00780
Kurtosis:	2.727	Cond. No.	5.53e+09

Omnibus:	6.663	Durbin-Watson:	1.619
Prob(Omnibus):	0.036	Jarque-Bera (JB):	6.817
Skew:	0.582	Prob(JB):	0.0331
Kurtosis:	2.906	Cond. No.	6.56e+09

Omnibus:	104.826	Durbin-Watson:	1.463
Prob(Omnibus):	0.000	Jarque-Bera (JB):	181.497
Skew:	0.897	Prob(JB):	3.88e-40
Kurtosis:	4.667	Cond. No.	9.50e+07