notebook.community

Edit and run



In [1]:

    
%matplotlib inline



In [2]:

    
#
# Covariates are: 
#   - Number of unique CpGs per cell
#   - Median Average Read CpG per cell (or mean if normally distributed)
#   - BS rate per cell
#   - CLL or Normal status per cell
#
# For distances (i.e. PDR difference between pairs, methylation difference between pairs), 
# the covariates are the mean between the two pairs. 
#



In [3]:

    
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # print all rows


import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')

import statsmodels.api as sm



In [4]:

    
stats = pd.read_csv("all_RRBS_statistics_final.csv")



In [5]:

    
stats.shape









    Out[5]:





(438, 15)



In [6]:

    
normal = stats[stats["type"]=="normal"]
CLL = stats[stats["type"]=="CLL"]



In [7]:

    
len(normal)









    Out[7]:





336



In [8]:

    
len(CLL)









    Out[8]:





102



In [9]:

    
mcell_cpg = pd.read_csv("mcell_avgCpGs.csv")
pcell_cpg = pd.read_csv("pcell_avgCpGs.csv")
CD19cell_cpg = pd.read_csv("CD19_avgCpGs.csv")
normalB_cell_cpg = pd.read_csv("normalB_avgCpGs.csv")
trito_cell_cpg = pd.read_csv("trito_avgCpGs.csv")
cw154_cell_cpg = pd.read_csv("cw154_cpgs.csv")



In [10]:

    
mcell_cpg = mcell_cpg.drop(["Unnamed: 0"], axis=1)
pcell_cpg = pcell_cpg.drop(["Unnamed: 0"], axis=1)
CD19cell_cpg = CD19cell_cpg.drop(["Unnamed: 0"], axis=1)
normalB_cell_cpg = normalB_cell_cpg.drop(["Unnamed: 0"], axis=1)
trito_cell_cpg = trito_cell_cpg.drop(["Unnamed: 0"], axis=1)
cw154_cell_cpg = cw154_cell_cpg.drop(["Unnamed: 0"], axis=1)



In [11]:

    
cpg_total = pd.concat([mcell_cpg, pcell_cpg, CD19cell_cpg, normalB_cell_cpg, trito_cell_cpg, cw154_cell_cpg])



In [12]:

    
cpg_total.shape









    Out[12]:





(513, 4)



In [13]:

    
merged = stats.merge(cpg_total, on="filename")



In [14]:

    
merged.shape









    Out[14]:





(438, 18)



In [15]:

    
merged.head()









    Out[15]:






  
    
      
      filename
      methylation
      PDR_total
      methylation_unweighted
      PDR_unweighted
      thisMeth
      mixedReadCount
      total_reads
      type
      bio
      protocol
      total_cpg_no_filter
      total_cpg_gtrthan1
      total_cpg_gtrthan38
      bsRate
      avgReadCpgs_nofilter
      avgReadCpgs_lessthan1CpG
      avgReadCpgs_gtreql3.8CpG
    
  
  
    
      0
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CATGAC
      0.529505
      0.235795
      0.632802
      0.231878
      2208325.0
      983394.0
      4170549.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      525282.0
      525251.0
      435636.0
      0.9975
      5.354284
      5.355660
      7.019255
    
    
      1
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CCTTCG
      0.455550
      0.177631
      0.583859
      0.175371
      733064.0
      285841.0
      1609185.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      221972.0
      221962.0
      186757.0
      0.9975
      5.587294
      5.588449
      7.302612
    
    
      2
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CGGTAG
      0.515269
      0.177645
      0.618578
      0.174221
      1452802.0
      500870.0
      2819500.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      355730.0
      355713.0
      295624.0
      0.9975
      5.393199
      5.394331
      7.079288
    
    
      3
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTATTG
      0.556175
      0.176367
      0.652727
      0.172273
      2279354.0
      722800.0
      4098270.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      483179.0
      483150.0
      397812.0
      0.9975
      5.287116
      5.288477
      6.979525
    
    
      4
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTCAGC
      0.528642
      0.181331
      0.640401
      0.172287
      1394208.0
      478231.0
      2637340.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      356122.0
      356100.0
      294065.0
      0.9975
      5.314302
      5.315719
      6.995052



In [16]:

    
sns.lmplot(x="bsRate", y="methylation",  data=merged)
plt.title("methylation (weighted) vs bisulfite conversion rate")









    Out[16]:





<matplotlib.text.Text at 0x10d7cf160>



In [17]:

    
sns.lmplot(x="bsRate", y="methylation_unweighted",  data=merged)
plt.title("methylation (unweighted) vs bisulfite conversion rate")









    Out[17]:





<matplotlib.text.Text at 0x10d898198>



In [18]:

    
sns.lmplot(x="bsRate", y="PDR_total",  data=merged)
plt.title("PDR (weighted) vs bisulfite conversion rate")









    Out[18]:





<matplotlib.text.Text at 0x10dabaef0>



In [19]:

    
sns.lmplot(x="bsRate", y="PDR_unweighted",  data=merged)
plt.title("PDR (unweighted) vs bisulfite conversion rate")









    Out[19]:





<matplotlib.text.Text at 0x10dbc4c18>



In [20]:

    
sns.lmplot(x="bsRate", y="total_cpg_no_filter",  data=merged)
plt.title("Total # unique CpGs per cell (no filter) vs bisulfite conversion rate")









    Out[20]:





<matplotlib.text.Text at 0x10e4fdb00>



In [21]:

    
sns.lmplot(x="bsRate", y="total_cpg_gtrthan1",  data=merged)
plt.title("Total # unique CpGs per cell (filter: > 1) vs bisulfite conversion rate")









    Out[21]:





<matplotlib.text.Text at 0x10e603cf8>



In [22]:

    
sns.lmplot(x="bsRate", y="total_cpg_gtrthan38",  data=merged)
plt.title("Total # unique CpGs per cell (filter: >= 3.8) vs bisulfite conversion rate")









    Out[22]:





<matplotlib.text.Text at 0x10e7096d8>



In [23]:

    
sns.lmplot(x="bsRate", y="avgReadCpgs_nofilter",  data=merged)
plt.title("Mean avgCpG read per cell (no filter) vs bisulfite conversion rate")









    Out[23]:





<matplotlib.text.Text at 0x10e895a20>



In [24]:

    
sns.lmplot(x="bsRate", y="avgReadCpgs_lessthan1CpG",  data=merged)
plt.title("Mean avgCpG read per cell (filter: > 1) vs bisulfite conversion rate")









    Out[24]:





<matplotlib.text.Text at 0x10e8fcac8>



In [25]:

    
sns.lmplot(x="bsRate", y="avgReadCpgs_gtreql3.8CpG",  data=merged)
plt.title("Mean avgCpG read per cell (filter: >= 3.8) vs bisulfite conversion rate")









    Out[25]:





<matplotlib.text.Text at 0x10eb23400>



In [ ]:



In [ ]:



In [26]:

    
tritopool = merged[merged["protocol"] == 'trito_pool_1']       # select only "trito_pool_1" files
print(len(tritopool))
tritopool = tritopool.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
tritopoolA = tritopool.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool.filename,2)) # combines into all pairs
out = pd.DataFrame([tritopoolA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.methylation, tritopool.methylation)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1 = pd.merge(out, methylation_differences, how='inner')
print(pairs1.shape)
pairs1 = pd.merge(out, methylation_differences, how='inner')
print(pairs1.shape)
pairs1 = pairs1.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs1.methylation_difference # dependent variable to predict

X = pairs1.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



21
(210, 14)
(210, 14)
(210,)
(210, 3)
Regression results for CLL 'RRBS_trito_pool1', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[26]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.051


  Model:                       OLS             Adj. R-squared:        0.037


  Method:                 Least Squares        F-statistic:           3.694


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):   0.0127 


  Time:                     18:36:00           Log-Likelihood:       634.72


  No. Observations:             210            AIC:                  -1261.


  Df Residuals:                 206            BIC:                  -1248.


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -30.3253     11.387     -2.663   0.008    -52.775    -7.875


  total_cpg_no_filter  -3.279e-08   1.12e-08     -2.915   0.004   -5.5e-08 -1.06e-08


  bsRate_mean             30.6098     11.611      2.636   0.009      7.719    53.501


  avgReadCpG_mean          0.0516      0.029      1.793   0.075     -0.005     0.108




  Omnibus:        35.998    Durbin-Watson:         2.175


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     10.728


  Skew:            0.253    Prob(JB):            0.00468


  Kurtosis:        2.015    Cond. No.           1.46e+10



In [27]:

    
print("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs1, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")









    



methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[27]:





<matplotlib.text.Text at 0x10ed2c978>



In [28]:

    
print("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs1, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")









    



methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[28]:





<matplotlib.text.Text at 0x10f027ef0>



In [29]:

    
print("methylation_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs1, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")









    



methylation_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_1






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[29]:





<matplotlib.text.Text at 0x10f33ed68>



In [30]:

    
tritopool = merged[merged["protocol"] == 'trito_pool_2']       # select only "trito_pool_1" files
print(len(tritopool))
tritopool = tritopool.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
tritopoolA = tritopool.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool.filename,2)) # combines into all pairs
out = pd.DataFrame([tritopoolA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.methylation, tritopool.methylation)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1a = pd.merge(out, methylation_differences, how='inner')
print(pairs1a.shape)
pairs1a = pairs1a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs1a.methylation_difference # dependent variable to predict

X = pairs1a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



21
(210, 14)
(210,)
(210, 3)
Regression results for CLL 'RRBS_trito_pool1', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[30]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.174


  Model:                       OLS             Adj. R-squared:        0.162


  Method:                 Least Squares        F-statistic:           14.46


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):  1.39e-08


  Time:                     18:36:04           Log-Likelihood:       664.61


  No. Observations:             210            AIC:                  -1321.


  Df Residuals:                 206            BIC:                  -1308.


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -50.2686     11.460     -4.386   0.000    -72.862   -27.675


  total_cpg_no_filter  -5.383e-08   9.12e-09     -5.900   0.000  -7.18e-08 -3.58e-08


  bsRate_mean             51.5942     11.885      4.341   0.000     28.162    75.026


  avgReadCpG_mean         -0.0031      0.029     -0.106   0.916     -0.060     0.054




  Omnibus:        24.923    Durbin-Watson:         2.253


  Prob(Omnibus):   0.000    Jarque-Bera (JB):      7.923


  Skew:            0.135    Prob(JB):             0.0190


  Kurtosis:        2.088    Cond. No.           1.68e+10



In [31]:

    
print("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs1a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")









    



methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[31]:





<matplotlib.text.Text at 0x10f5ab2b0>



In [32]:

    
print("methylation_difference vs. mean avgReadCpGs, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs1a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")









    



methylation_difference vs. mean avgReadCpGs, jointplot, RRBS_trito_pool CLL trito_2






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[32]:





<matplotlib.text.Text at 0x10f7eb9b0>



In [33]:

    
print("methylation_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs1a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")









    



methylation_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_2






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[33]:





<matplotlib.text.Text at 0x10faeac18>



In [ ]:



In [34]:

    
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.methylation, cw154.methylation)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2 = pd.merge(out, methylation_differences, how='inner')
print(pairs2.shape)
pairs2 = pairs2.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs2.methylation_difference # dependent variable to predict

X = pairs2.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_Tris_protease', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_Tris_protease', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[34]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.302


  Model:                       OLS             Adj. R-squared:        0.291


  Method:                 Least Squares        F-statistic:           26.82


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):  1.84e-14


  Time:                     18:36:09           Log-Likelihood:       443.80


  No. Observations:             190            AIC:                  -879.6


  Df Residuals:                 186            BIC:                  -866.6


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   29.9982     10.370      2.893   0.004      9.541    50.456


  total_cpg_no_filter   -8.72e-08   1.76e-08     -4.962   0.000  -1.22e-07 -5.25e-08


  bsRate_mean            -32.8258     10.856     -3.024   0.003    -54.242   -11.409


  avgReadCpG_mean          0.2277      0.033      6.904   0.000      0.163     0.293




  Omnibus:         8.958    Durbin-Watson:         1.474


  Prob(Omnibus):   0.011    Jarque-Bera (JB):     12.260


  Skew:            0.301    Prob(JB):            0.00218


  Kurtosis:        4.089    Cond. No.           3.27e+09



In [35]:

    
print("methylation_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs2, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, cw154  CLL cw154_Tris_protease")









    



methylation_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[35]:





<matplotlib.text.Text at 0x10fe46780>



In [36]:

    
print("methylation_difference vs. avgReadCpGs, jointplot, CLL cw154_Tris_protease")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs2, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_Tris_protease")









    



methylation_difference vs. avgReadCpGs, jointplot, CLL cw154_Tris_protease






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[36]:





<matplotlib.text.Text at 0x11001d7b8>



In [37]:

    
print("methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs2, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_Tris_protease")









    



methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[37]:





<matplotlib.text.Text at 0x110274860>



In [38]:

    
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease_GR']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.methylation, cw154.methylation)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2a = pd.merge(out, methylation_differences, how='inner')
print(pairs2a.shape)
pairs2a = pairs2a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs2a.methylation_difference # dependent variable to predict

X = pairs2a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_Tris_protease_GR', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_Tris_protease_GR', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[38]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.169


  Model:                       OLS             Adj. R-squared:        0.156


  Method:                 Least Squares        F-statistic:           12.63


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):  1.50e-07


  Time:                     18:36:13           Log-Likelihood:       460.18


  No. Observations:             190            AIC:                  -912.4


  Df Residuals:                 186            BIC:                  -899.4


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -40.9702     10.552     -3.883   0.000    -61.787   -20.154


  total_cpg_no_filter  -5.822e-08    1.3e-08     -4.465   0.000  -8.39e-08 -3.25e-08


  bsRate_mean             42.6072     11.072      3.848   0.000     20.765    64.449


  avgReadCpG_mean          0.0120      0.022      0.557   0.578     -0.031     0.055




  Omnibus:         3.281    Durbin-Watson:         2.393


  Prob(Omnibus):   0.194    Jarque-Bera (JB):      2.603


  Skew:           -0.160    Prob(JB):              0.272


  Kurtosis:        2.524    Cond. No.           4.47e+09



In [39]:

    
print("methylation_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease_GR")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs2a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, cw154  CLL cw154_Tris_protease_GR")









    



methylation_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease_GR






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[39]:





<matplotlib.text.Text at 0x110501160>



In [40]:

    
print("methylation_difference vs. avgReadCpgs, jointplot, CLL cw154_Tris_protease_GR")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs2a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_Tris_protease_GR")









    



methylation_difference vs. avgReadCpgs, jointplot, CLL cw154_Tris_protease_GR






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[40]:





<matplotlib.text.Text at 0x1102ac2e8>



In [41]:

    
print("methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease_GR")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs2a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_Tris_protease_GR")









    



methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease_GR






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[41]:





<matplotlib.text.Text at 0x1109756a0>



In [ ]:



In [ ]:



In [42]:

    
cw154 = merged[merged["protocol"] == 'cw154_CutSmart_proteinase_K']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.methylation, cw154.methylation)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2b = pd.merge(out, methylation_differences, how='inner')
print(pairs2b.shape)
pairs2b = pairs2b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs2b.methylation_difference # dependent variable to predict

X = pairs2b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[42]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.097


  Model:                       OLS             Adj. R-squared:        0.083


  Method:                 Least Squares        F-statistic:           6.674


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):  0.000265


  Time:                     18:36:16           Log-Likelihood:       514.91


  No. Observations:             190            AIC:                  -1022.


  Df Residuals:                 186            BIC:                  -1009.


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -37.7684      9.986     -3.782   0.000    -57.469   -18.068


  total_cpg_no_filter  -4.613e-08   1.24e-08     -3.717   0.000  -7.06e-08 -2.16e-08


  bsRate_mean             38.9209     10.353      3.759   0.000     18.497    59.345


  avgReadCpG_mean          0.0417      0.034      1.237   0.218     -0.025     0.108




  Omnibus:         7.301    Durbin-Watson:         1.539


  Prob(Omnibus):   0.026    Jarque-Bera (JB):      6.896


  Skew:            0.411    Prob(JB):             0.0318


  Kurtosis:        2.557    Cond. No.           6.96e+09



In [43]:

    
print("methylation_difference vs. bsRate, jointplot,  CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs2b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")









    



methylation_difference vs. bsRate, jointplot,  CLL cw154_CutSmart_proteinase_K






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[43]:





<matplotlib.text.Text at 0x110cfd748>



In [44]:

    
print("methylation_difference vs. avgReadCpG_mean, jointplot, CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs2b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_CutSmart_proteinase_K")









    



methylation_difference vs. avgReadCpG_mean, jointplot, CLL cw154_CutSmart_proteinase_K






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[44]:





<matplotlib.text.Text at 0x110f42240>



In [45]:

    
print("methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs2b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")









    



methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_CutSmart_proteinase_K






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[45]:





<matplotlib.text.Text at 0x11118b6a0>



In [ ]:



In [46]:

    
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell1_22_']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation, pcell.methylation)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3 = pd.merge(out, methylation_differences, how='inner')
print(pairs3.shape)
pairs3 = pairs3.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs3.methylation_difference # dependent variable to predict

X = pairs3.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



17
(136, 14)
(136,)
(136, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[46]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.176


  Model:                       OLS             Adj. R-squared:        0.158


  Method:                 Least Squares        F-statistic:           9.422


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):  1.10e-05


  Time:                     18:36:20           Log-Likelihood:       329.50


  No. Observations:             136            AIC:                  -651.0


  Df Residuals:                 132            BIC:                  -639.4


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -80.5968     31.693     -2.543   0.012   -143.288   -17.905


  total_cpg_no_filter   1.518e-07   3.97e-08      3.825   0.000   7.33e-08   2.3e-07


  bsRate_mean             79.4942     31.773      2.502   0.014     16.644   142.344


  avgReadCpG_mean          0.1876      0.041      4.554   0.000      0.106     0.269




  Omnibus:         3.415    Durbin-Watson:         1.088


  Prob(Omnibus):   0.181    Jarque-Bera (JB):      3.118


  Skew:            0.293    Prob(JB):              0.210


  Kurtosis:        2.545    Cond. No.           7.56e+09



In [47]:

    
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs3, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")









    



methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[47]:





<matplotlib.text.Text at 0x1113ab470>



In [48]:

    
print("methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs3, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell1_22_")









    



methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[48]:





<matplotlib.text.Text at 0x11160dcf8>



In [49]:

    
print("methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs3, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")









    



methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[49]:





<matplotlib.text.Text at 0x1119e6470>



In [ ]:



In [50]:

    
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell23_44']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation, pcell.methylation)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3a = pd.merge(out, methylation_differences, how='inner')
print(pairs3a.shape)
pairs3a = pairs3a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs3a.methylation_difference # dependent variable to predict

X = pairs3a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



22
(231, 14)
(231,)
(231, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[50]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.191


  Model:                       OLS             Adj. R-squared:        0.180


  Method:                 Least Squares        F-statistic:           17.89


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):  1.87e-10


  Time:                     18:36:23           Log-Likelihood:       523.58


  No. Observations:             231            AIC:                  -1039.


  Df Residuals:                 227            BIC:                  -1025.


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -79.0053     42.215     -1.872   0.063   -162.188     4.177


  total_cpg_no_filter    8.53e-08   3.24e-08      2.633   0.009   2.15e-08  1.49e-07


  bsRate_mean             78.2656     42.287      1.851   0.065     -5.060   161.591


  avgReadCpG_mean          0.1346      0.020      6.857   0.000      0.096     0.173




  Omnibus:         8.090    Durbin-Watson:         1.492


  Prob(Omnibus):   0.018    Jarque-Bera (JB):      8.393


  Skew:            0.465    Prob(JB):             0.0151


  Kurtosis:        2.921    Cond. No.           1.03e+10



In [51]:

    
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs3a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")









    



methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[51]:





<matplotlib.text.Text at 0x111d5c780>



In [52]:

    
print("methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs3a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell23_44")









    



methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[52]:





<matplotlib.text.Text at 0x111f9be10>



In [53]:

    
print("methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs3a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")









    



methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[53]:





<matplotlib.text.Text at 0x1121a5390>



In [ ]:



In [54]:

    
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell45_66']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation, pcell.methylation)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3b = pd.merge(out, methylation_differences, how='inner')
print(pairs3b.shape)
pairs3b = pairs3b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs3b.methylation_difference # dependent variable to predict

X = pairs3b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



9
(36, 14)
(36,)
(36, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[54]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.422


  Model:                       OLS             Adj. R-squared:        0.368


  Method:                 Least Squares        F-statistic:           7.792


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):  0.000482


  Time:                     18:36:27           Log-Likelihood:       85.081


  No. Observations:              36            AIC:                  -162.2


  Df Residuals:                  32            BIC:                  -155.8


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  371.9341    197.834      1.880   0.069    -31.040   774.909


  total_cpg_no_filter  -1.834e-08   2.82e-08     -0.650   0.520  -7.58e-08  3.91e-08


  bsRate_mean           -373.2763    197.958     -1.886   0.068   -776.504    29.952


  avgReadCpG_mean          0.0663      0.063      1.045   0.304     -0.063     0.195




  Omnibus:         4.895    Durbin-Watson:         1.943


  Prob(Omnibus):   0.086    Jarque-Bera (JB):      3.450


  Skew:           -0.676    Prob(JB):              0.178


  Kurtosis:        3.687    Cond. No.           3.29e+10



In [55]:

    
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs3b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")









    



methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[55]:





<matplotlib.text.Text at 0x1124ce470>



In [56]:

    
print("methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs3b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell45_66")









    



methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[56]:





<matplotlib.text.Text at 0x1126fbd68>



In [57]:

    
print("methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs3b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")









    



methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[57]:





<matplotlib.text.Text at 0x112ad05f8>



In [ ]:



In [ ]:



In [58]:

    
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell67_88']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation, pcell.methylation)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3c = pd.merge(out, methylation_differences, how='inner')
print(pairs3c.shape)
pairs3c = pairs3c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs3c.methylation_difference # dependent variable to predict

X = pairs3c.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[58]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.047


  Model:                       OLS             Adj. R-squared:        0.031


  Method:                 Least Squares        F-statistic:           3.045


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):   0.0301 


  Time:                     18:36:30           Log-Likelihood:       524.85


  No. Observations:             190            AIC:                  -1042.


  Df Residuals:                 186            BIC:                  -1029.


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   62.9080     23.818      2.641   0.009     15.920   109.896


  total_cpg_no_filter   3.265e-08   1.96e-08      1.667   0.097     -6e-09  7.13e-08


  bsRate_mean            -63.0209     23.812     -2.647   0.009   -109.998   -16.044


  avgReadCpG_mean         -0.0074      0.026     -0.290   0.772     -0.058     0.043




  Omnibus:        13.771    Durbin-Watson:         1.488


  Prob(Omnibus):   0.001    Jarque-Bera (JB):     15.195


  Skew:            0.693    Prob(JB):           0.000502


  Kurtosis:        3.023    Cond. No.           8.48e+09



In [59]:

    
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs3c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")









    



methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[59]:





<matplotlib.text.Text at 0x112d44c18>



In [60]:

    
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs3c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell67_88")









    



methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27pcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[60]:





<matplotlib.text.Text at 0x113053f28>



In [61]:

    
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs3c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")









    



methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27pcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[61]:





<matplotlib.text.Text at 0x1132cee80>



In [ ]:



In [ ]:



In [62]:

    
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell1_22_']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation, mcell.methylation)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4 = pd.merge(out, methylation_differences, how='inner')
print(pairs4.shape)
pairs4 = pairs4.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs4.methylation_difference # dependent variable to predict

X = pairs4.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



18
(153, 14)
(153,)
(153, 3)
Regression results for Normal 'NormalBCD19pCD27mcell1_22_', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[62]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.475


  Model:                       OLS             Adj. R-squared:        0.464


  Method:                 Least Squares        F-statistic:           44.88


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):  1.01e-20


  Time:                     18:36:34           Log-Likelihood:       358.78


  No. Observations:             153            AIC:                  -709.6


  Df Residuals:                 149            BIC:                  -697.4


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   31.8745     63.262      0.504   0.615    -93.132   156.881


  total_cpg_no_filter   2.164e-07   3.49e-08      6.208   0.000   1.48e-07  2.85e-07


  bsRate_mean            -34.4878     63.575     -0.542   0.588   -160.112    91.137


  avgReadCpG_mean          0.3535      0.037      9.641   0.000      0.281     0.426




  Omnibus:         1.604    Durbin-Watson:         1.420


  Prob(Omnibus):   0.448    Jarque-Bera (JB):      1.661


  Skew:            0.201    Prob(JB):              0.436


  Kurtosis:        2.685    Cond. No.           1.52e+10



In [63]:

    
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs4, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")









    



methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[63]:





<matplotlib.text.Text at 0x113502f60>



In [64]:

    
print("methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs4, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell1_22_")









    



methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[64]:





<matplotlib.text.Text at 0x1137618d0>



In [65]:

    
print("methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs4, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")









    



methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[65]:





<matplotlib.text.Text at 0x113a86dd8>



In [ ]:



In [66]:

    
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell23_44']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation, mcell.methylation)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4a = pd.merge(out, methylation_differences, how='inner')
print(pairs4a.shape)
pairs4a = pairs4a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs4a.methylation_difference # dependent variable to predict

X = pairs4a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell23_44', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



18
(153, 14)
(153,)
(153, 3)
Regression results for Normal 'NormalBCD19pCD27mcell23_44', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[66]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.171


  Model:                       OLS             Adj. R-squared:        0.154


  Method:                 Least Squares        F-statistic:           10.21


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):  3.72e-06


  Time:                     18:36:37           Log-Likelihood:       406.36


  No. Observations:             153            AIC:                  -804.7


  Df Residuals:                 149            BIC:                  -792.6


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -124.8072     52.266     -2.388   0.018   -228.086   -21.529


  total_cpg_no_filter   1.656e-07    3.5e-08      4.733   0.000   9.65e-08  2.35e-07


  bsRate_mean            124.5439     52.412      2.376   0.019     20.978   228.110


  avgReadCpG_mean          0.0759      0.035      2.139   0.034      0.006     0.146




  Omnibus:         6.175    Durbin-Watson:         1.448


  Prob(Omnibus):   0.046    Jarque-Bera (JB):      6.310


  Skew:            0.494    Prob(JB):             0.0426


  Kurtosis:        2.888    Cond. No.           1.55e+10



In [67]:

    
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs4a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")









    



methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[67]:





<matplotlib.text.Text at 0x113dd79e8>



In [68]:

    
print("methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs4a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell23_44")









    



methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[68]:





<matplotlib.text.Text at 0x113f49048>



In [69]:

    
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs4a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")









    



methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[69]:





<matplotlib.text.Text at 0x114276f28>



In [ ]:



In [70]:

    
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell45_66']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation, mcell.methylation)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4b = pd.merge(out, methylation_differences, how='inner')
print(pairs4b.shape)
pairs4b = pairs4b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs4b.methylation_difference # dependent variable to predict

X = pairs4b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell45_66', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



17
(136, 14)
(136,)
(136, 3)
Regression results for Normal 'NormalBCD19pCD27mcell45_66', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[70]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.336


  Model:                       OLS             Adj. R-squared:        0.321


  Method:                 Least Squares        F-statistic:           22.30


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):  9.63e-12


  Time:                     18:36:41           Log-Likelihood:       309.52


  No. Observations:             136            AIC:                  -611.0


  Df Residuals:                 132            BIC:                  -599.4


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    0.4083     66.568      0.006   0.995   -131.270   132.086


  total_cpg_no_filter   3.337e-07   5.76e-08      5.794   0.000    2.2e-07  4.48e-07


  bsRate_mean             -2.8663     66.672     -0.043   0.966   -134.749   129.017


  avgReadCpG_mean          0.3437      0.043      7.960   0.000      0.258     0.429




  Omnibus:         3.749    Durbin-Watson:         1.332


  Prob(Omnibus):   0.153    Jarque-Bera (JB):      3.798


  Skew:            0.389    Prob(JB):              0.150


  Kurtosis:        2.745    Cond. No.           1.28e+10



In [71]:

    
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs4b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")









    



methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[71]:





<matplotlib.text.Text at 0x1145cbd68>



In [72]:

    
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs4b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell45_66")









    



methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[72]:





<matplotlib.text.Text at 0x11389bbe0>



In [73]:

    
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs4b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")









    



methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[73]:





<matplotlib.text.Text at 0x114ab2320>



In [ ]:



In [74]:

    
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell67_88']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation, mcell.methylation)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4c = pd.merge(out, methylation_differences, how='inner')
print(pairs4c.shape)
pairs4c = pairs4c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs4c.methylation_difference # dependent variable to predict

X = pairs4c.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell67_88', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



21
(210, 14)
(210,)
(210, 3)
Regression results for Normal 'NormalBCD19pCD27mcell67_88', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[74]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.163


  Model:                       OLS             Adj. R-squared:        0.151


  Method:                 Least Squares        F-statistic:           13.41


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):  5.01e-08


  Time:                     18:36:45           Log-Likelihood:       462.71


  No. Observations:             210            AIC:                  -917.4


  Df Residuals:                 206            BIC:                  -904.0


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -273.3499    119.265     -2.292   0.023   -508.486   -38.213


  total_cpg_no_filter   1.556e-07   4.42e-08      3.522   0.001   6.85e-08  2.43e-07


  bsRate_mean            272.5222    119.451      2.281   0.024     37.019   508.025


  avgReadCpG_mean          0.2106      0.036      5.914   0.000      0.140     0.281




  Omnibus:         3.871    Durbin-Watson:         2.144


  Prob(Omnibus):   0.144    Jarque-Bera (JB):      3.833


  Skew:            0.290    Prob(JB):              0.147


  Kurtosis:        2.682    Cond. No.           3.13e+10



In [75]:

    
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs4c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")









    



methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[75]:





<matplotlib.text.Text at 0x114e1eb70>



In [76]:

    
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs4c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")









    



methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[76]:





<matplotlib.text.Text at 0x114ff6cf8>



In [77]:

    
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs4c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")









    



methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[77]:





<matplotlib.text.Text at 0x11523ecf8>



In [ ]:



In [78]:

    
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell1_22_']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.methylation, CD19cell.methylation)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5 = pd.merge(out, methylation_differences, how='inner')
print(pairs5.shape)
pairs5 = pairs5.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs5.methylation_difference # dependent variable to predict

X = pairs5.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



21
(210, 14)
(210,)
(210, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell1_22_', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[78]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.076


  Model:                       OLS             Adj. R-squared:        0.063


  Method:                 Least Squares        F-statistic:           5.645


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):  0.000977


  Time:                     18:36:49           Log-Likelihood:       457.19


  No. Observations:             210            AIC:                  -906.4


  Df Residuals:                 206            BIC:                  -893.0


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   -8.5868     45.516     -0.189   0.851    -98.324    81.151


  total_cpg_no_filter  -3.285e-08   1.14e-08     -2.890   0.004  -5.53e-08 -1.04e-08


  bsRate_mean              8.0917     45.637      0.177   0.859    -81.885    98.068


  avgReadCpG_mean          0.1083      0.031      3.511   0.001      0.047     0.169




  Omnibus:        20.949    Durbin-Watson:         1.419


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     24.161


  Skew:            0.811    Prob(JB):           5.67e-06


  Kurtosis:        3.365    Cond. No.           1.48e+10



In [79]:

    
print("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs5, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")









    



methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[79]:





<matplotlib.text.Text at 0x1155b79b0>



In [80]:

    
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs5, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell1_22_")









    



methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[80]:





<matplotlib.text.Text at 0x1157df240>



In [81]:

    
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs5, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")









    



methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[81]:





<matplotlib.text.Text at 0x115a24d68>



In [ ]:



In [82]:

    
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell23_44']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.methylation, CD19cell.methylation)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5a = pd.merge(out, methylation_differences, how='inner')
print(pairs5a.shape)
pairs5a = pairs5a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs5a.methylation_difference # dependent variable to predict

X = pairs5a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell23_44', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell23_44', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[82]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.141


  Model:                       OLS             Adj. R-squared:        0.127


  Method:                 Least Squares        F-statistic:           10.17


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):  3.11e-06


  Time:                     18:36:52           Log-Likelihood:       443.89


  No. Observations:             190            AIC:                  -879.8


  Df Residuals:                 186            BIC:                  -866.8


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -49.0700     26.378     -1.860   0.064   -101.108     2.968


  total_cpg_no_filter  -3.547e-08   7.72e-09     -4.595   0.000  -5.07e-08 -2.02e-08


  bsRate_mean             48.8384     26.484      1.844   0.067     -3.409   101.086


  avgReadCpG_mean          0.0786      0.029      2.753   0.006      0.022     0.135




  Omnibus:        15.542    Durbin-Watson:         1.021


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     16.858


  Skew:            0.702    Prob(JB):           0.000218


  Kurtosis:        3.401    Cond. No.           1.63e+10



In [83]:

    
print("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs5a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")









    



methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[83]:





<matplotlib.text.Text at 0x115e6d828>



In [84]:

    
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs5a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell23_44")









    



methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[84]:





<matplotlib.text.Text at 0x115f44f28>



In [85]:

    
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs5a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")









    



methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[85]:





<matplotlib.text.Text at 0x116274470>



In [ ]:



In [86]:

    
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell45_66']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.methylation, CD19cell.methylation)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5b = pd.merge(out, methylation_differences, how='inner')
print(pairs5b.shape)
pairs5b = pairs5b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs5b.methylation_difference # dependent variable to predict

X = pairs5b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell45_66', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



22
(231, 14)
(231,)
(231, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell45_66', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[86]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.136


  Model:                       OLS             Adj. R-squared:        0.125


  Method:                 Least Squares        F-statistic:           11.91


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):  2.84e-07


  Time:                     18:36:57           Log-Likelihood:       562.90


  No. Observations:             231            AIC:                  -1118.


  Df Residuals:                 227            BIC:                  -1104.


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  177.4453     55.841      3.178   0.002     67.413   287.478


  total_cpg_no_filter  -6.119e-09    8.4e-09     -0.728   0.467  -2.27e-08  1.04e-08


  bsRate_mean           -177.0063     55.939     -3.164   0.002   -287.232   -66.780


  avgReadCpG_mean         -0.1357      0.032     -4.237   0.000     -0.199    -0.073




  Omnibus:        12.686    Durbin-Watson:         1.478


  Prob(Omnibus):   0.002    Jarque-Bera (JB):     13.548


  Skew:            0.592    Prob(JB):            0.00114


  Kurtosis:        3.068    Cond. No.           2.98e+10



In [87]:

    
print("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs5b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")









    



methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[87]:





<matplotlib.text.Text at 0x1164b4be0>



In [88]:

    
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs5b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell45_66")









    



methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[88]:





<matplotlib.text.Text at 0x116754128>



In [89]:

    
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs5b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")









    



methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[89]:





<matplotlib.text.Text at 0x116b353c8>



In [ ]:



In [90]:

    
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell67_88']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.methylation, CD19cell.methylation)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5c = pd.merge(out, methylation_differences, how='inner')
print(pairs5c.shape)
pairs5c = pairs5c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs5c.methylation_difference # dependent variable to predict

X = pairs5c.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell67_88', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell67_88', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[90]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.038


  Model:                       OLS             Adj. R-squared:        0.023


  Method:                 Least Squares        F-statistic:           2.458


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):   0.0643 


  Time:                     18:37:00           Log-Likelihood:       432.68


  No. Observations:             190            AIC:                  -857.4


  Df Residuals:                 186            BIC:                  -844.4


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   71.2926     50.936      1.400   0.163    -29.193   171.779


  total_cpg_no_filter  -6.118e-09   9.79e-09     -0.625   0.533  -2.54e-08  1.32e-08


  bsRate_mean            -71.7353     51.060     -1.405   0.162   -172.467    28.996


  avgReadCpG_mean          0.0637      0.025      2.537   0.012      0.014     0.113




  Omnibus:         9.227    Durbin-Watson:         1.561


  Prob(Omnibus):   0.010    Jarque-Bera (JB):      9.799


  Skew:            0.552    Prob(JB):            0.00745


  Kurtosis:        2.857    Cond. No.           1.62e+10



In [91]:

    
print("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs5c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")









    



methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[91]:





<matplotlib.text.Text at 0x116e91be0>



In [92]:

    
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs5c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")









    



methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[92]:





<matplotlib.text.Text at 0x1170ec550>



In [93]:

    
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs5c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")









    



methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[93]:





<matplotlib.text.Text at 0x1174d96d8>



In [ ]:



In [94]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_A1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6 = pd.merge(out, methylation_differences, how='inner')
print(pairs6.shape)
pairs6 = pairs6.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6.methylation_difference # dependent variable to predict

X = pairs6.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_A1_24', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_A1_24', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[94]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.076


  Model:                       OLS             Adj. R-squared:        0.059


  Method:                 Least Squares        F-statistic:           4.560


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):   0.00425


  Time:                     18:37:03           Log-Likelihood:       476.34


  No. Observations:             171            AIC:                  -944.7


  Df Residuals:                 167            BIC:                  -932.1


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    6.9159      4.154      1.665   0.098     -1.286    15.117


  total_cpg_no_filter   1.243e-07   4.48e-08      2.774   0.006   3.58e-08  2.13e-07


  bsRate_mean             -6.9159      4.329     -1.598   0.112    -15.462     1.630


  avgReadCpG_mean         -0.0419      0.026     -1.628   0.105     -0.093     0.009




  Omnibus:        13.706    Durbin-Watson:         1.123


  Prob(Omnibus):   0.001    Jarque-Bera (JB):     15.393


  Skew:            0.734    Prob(JB):           0.000454


  Kurtosis:        2.930    Cond. No.           9.65e+08



In [95]:

    
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24")









    



methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[95]:





<matplotlib.text.Text at 0x11783ef98>



In [96]:

    
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_A1_24")









    



methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_A1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[96]:





<matplotlib.text.Text at 0x117a7c8d0>



In [97]:

    
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_A1_24")









    



methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_A1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[97]:





<matplotlib.text.Text at 0x1173dd2b0>



In [ ]:



In [98]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_B1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6a = pd.merge(out, methylation_differences, how='inner')
print(pairs6a.shape)
pairs6a = pairs6a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6a.methylation_difference # dependent variable to predict

X = pairs6a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_B1_24', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'normal_B_cell_B1_24', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[98]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.006


  Model:                       OLS             Adj. R-squared:       -0.011


  Method:                 Least Squares        F-statistic:          0.3451


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):    0.793 


  Time:                     18:37:07           Log-Likelihood:       409.77


  No. Observations:             190            AIC:                  -811.5


  Df Residuals:                 186            BIC:                  -798.6


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   -9.3140     17.013     -0.547   0.585    -42.878    24.250


  total_cpg_no_filter   1.368e-08   2.33e-08      0.587   0.558  -3.23e-08  5.97e-08


  bsRate_mean              9.8276     17.870      0.550   0.583    -25.426    45.081


  avgReadCpG_mean         -0.0135      0.040     -0.336   0.737     -0.093     0.066




  Omnibus:        11.696    Durbin-Watson:         1.748


  Prob(Omnibus):   0.003    Jarque-Bera (JB):     12.109


  Skew:            0.583    Prob(JB):            0.00235


  Kurtosis:        2.590    Cond. No.           4.47e+09



In [99]:

    
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24")









    



methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[99]:





<matplotlib.text.Text at 0x1180722b0>



In [100]:

    
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_B1_24")









    



methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_B1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[100]:





<matplotlib.text.Text at 0x11836f6d8>



In [101]:

    
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_B1_24")









    



methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_B1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[101]:





<matplotlib.text.Text at 0x1185a50f0>



In [ ]:



In [102]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_C1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6b = pd.merge(out, methylation_differences, how='inner')
print(pairs6b.shape)
pairs6b = pairs6b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6b.methylation_difference # dependent variable to predict

X = pairs6b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_C1_24', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_C1_24', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[102]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.015


  Model:                       OLS             Adj. R-squared:       -0.002


  Method:                 Least Squares        F-statistic:          0.8661


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):    0.460 


  Time:                     18:37:11           Log-Likelihood:       369.55


  No. Observations:             171            AIC:                  -731.1


  Df Residuals:                 167            BIC:                  -718.5


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   -2.0674     16.912     -0.122   0.903    -35.457    31.322


  total_cpg_no_filter  -2.583e-08    1.9e-08     -1.359   0.176  -6.34e-08  1.17e-08


  bsRate_mean              2.3154     17.679      0.131   0.896    -32.588    37.219


  avgReadCpG_mean         -0.0147      0.056     -0.263   0.793     -0.125     0.096




  Omnibus:        12.063    Durbin-Watson:         1.344


  Prob(Omnibus):   0.002    Jarque-Bera (JB):     13.236


  Skew:            0.666    Prob(JB):            0.00134


  Kurtosis:        2.713    Cond. No.           7.17e+09



In [103]:

    
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24")









    



methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[103]:





<matplotlib.text.Text at 0x1188038d0>



In [104]:

    
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_C1_24")









    



methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_C1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[104]:





<matplotlib.text.Text at 0x118a5ecc0>



In [105]:

    
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_C1_24")









    



methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_C1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[105]:





<matplotlib.text.Text at 0x118e1f400>



In [ ]:



In [106]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_D1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6c = pd.merge(out, methylation_differences, how='inner')
print(pairs6c.shape)
pairs6c = pairs6c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6c.methylation_difference # dependent variable to predict

X = pairs6c.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_D1_24', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_D1_24', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[106]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.009


  Model:                       OLS             Adj. R-squared:       -0.009


  Method:                 Least Squares        F-statistic:          0.4985


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):    0.684 


  Time:                     18:37:14           Log-Likelihood:       400.84


  No. Observations:             171            AIC:                  -793.7


  Df Residuals:                 167            BIC:                  -781.1


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    8.4566     10.614      0.797   0.427    -12.498    29.411


  total_cpg_no_filter  -1.428e-09   1.37e-08     -0.104   0.917  -2.85e-08  2.56e-08


  bsRate_mean             -8.6228     11.136     -0.774   0.440    -30.609    13.363


  avgReadCpG_mean         -0.0182      0.042     -0.436   0.663     -0.101     0.064




  Omnibus:        11.347    Durbin-Watson:         1.718


  Prob(Omnibus):   0.003    Jarque-Bera (JB):     12.434


  Skew:            0.653    Prob(JB):            0.00200


  Kurtosis:        2.802    Cond. No.           5.35e+09



In [107]:

    
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24")









    



methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[107]:





<matplotlib.text.Text at 0x119076438>



In [108]:

    
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_D1_24")









    



methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_D1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[108]:





<matplotlib.text.Text at 0x1192d20b8>



In [109]:

    
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_D1_24")









    



methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_D1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[109]:





<matplotlib.text.Text at 0x1195e0a58>



In [ ]:



In [110]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_G1_22']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6d = pd.merge(out, methylation_differences, how='inner')
print(pairs6d.shape)
pairs6d = pairs6d.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6d.methylation_difference # dependent variable to predict

X = pairs6d.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_G1_22', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_G1_22', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[110]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.098


  Model:                       OLS             Adj. R-squared:        0.082


  Method:                 Least Squares        F-statistic:           6.041


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):  0.000626


  Time:                     18:37:17           Log-Likelihood:       369.17


  No. Observations:             171            AIC:                  -730.3


  Df Residuals:                 167            BIC:                  -717.8


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   21.8211     12.056      1.810   0.072     -1.981    45.624


  total_cpg_no_filter   4.343e-08   2.49e-08      1.742   0.083   -5.8e-09  9.27e-08


  bsRate_mean            -24.4352     12.682     -1.927   0.056    -49.472     0.602


  avgReadCpG_mean          0.2367      0.057      4.184   0.000      0.125     0.348




  Omnibus:         9.120    Durbin-Watson:         1.943


  Prob(Omnibus):   0.010    Jarque-Bera (JB):      9.711


  Skew:            0.571    Prob(JB):            0.00778


  Kurtosis:        2.761    Cond. No.           4.19e+09



In [111]:

    
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6d, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22")









    



methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[111]:





<matplotlib.text.Text at 0x119912518>



In [112]:

    
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6d, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_G1_22")









    



methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_G1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[112]:





<matplotlib.text.Text at 0x119b5d6d8>



In [113]:

    
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6d, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_G1_22")









    



methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_G1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[113]:





<matplotlib.text.Text at 0x119e64cf8>



In [ ]:



In [ ]:



In [ ]:



In [114]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_H1_22']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6e = pd.merge(out, methylation_differences, how='inner')
print(pairs6e.shape)
pairs6e = pairs6e.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6e.methylation_difference # dependent variable to predict

X = pairs6e.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_H1_22', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



15
(105, 14)
(105,)
(105, 3)
Regression results for Normal 'normal_B_cell_H1_22', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[114]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.090


  Model:                       OLS             Adj. R-squared:        0.063


  Method:                 Least Squares        F-statistic:           3.315


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):   0.0230 


  Time:                     18:37:21           Log-Likelihood:       217.59


  No. Observations:             105            AIC:                  -427.2


  Df Residuals:                 101            BIC:                  -416.6


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   10.1334     27.564      0.368   0.714    -44.547    64.814


  total_cpg_no_filter  -5.553e-08   4.02e-08     -1.382   0.170  -1.35e-07  2.42e-08


  bsRate_mean             -7.9851     28.418     -0.281   0.779    -64.358    48.388


  avgReadCpG_mean         -0.3432      0.113     -3.029   0.003     -0.568    -0.118




  Omnibus:         8.251    Durbin-Watson:         1.945


  Prob(Omnibus):   0.016    Jarque-Bera (JB):      8.349


  Skew:            0.689    Prob(JB):             0.0154


  Kurtosis:        3.107    Cond. No.           6.03e+09



In [115]:

    
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6e, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22")









    



methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[115]:





<matplotlib.text.Text at 0x118a59630>



In [116]:

    
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6e, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_H1_22")









    



methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_H1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[116]:





<matplotlib.text.Text at 0x11a27ffd0>



In [117]:

    
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6e, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_H1_22")









    



methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_H1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[117]:





<matplotlib.text.Text at 0x11a4c7748>



In [118]:

    
print(pairs1.shape)
print(pairs1a.shape)
print(pairs2.shape)
print(pairs2a.shape)
print(pairs2b.shape)
print(pairs3.shape)
print(pairs3a.shape)
print(pairs3b.shape)
print(pairs3c.shape)
print(pairs4.shape)
print(pairs4a.shape)
print(pairs4b.shape)
print(pairs4c.shape)
print(pairs5.shape)
print(pairs5a.shape)
print(pairs5b.shape)
print(pairs5c.shape)
print(pairs6.shape)
print(pairs6a.shape)
print(pairs6b.shape)
print(pairs6c.shape)
print(pairs6d.shape)
print(pairs6e.shape)









    



(210, 14)
(210, 14)
(190, 14)
(190, 14)
(190, 14)
(136, 14)
(231, 14)
(36, 14)
(190, 14)
(153, 14)
(153, 14)
(136, 14)
(210, 14)
(210, 14)
(190, 14)
(231, 14)
(190, 14)
(171, 14)
(190, 14)
(171, 14)
(171, 14)
(171, 14)
(105, 14)



In [119]:

    
pairs1['type'] = str('CLL')
pairs1a['type'] = str('CLL')
pairs2['type'] = str('CLL')
pairs2a['type'] = str('CLL')
pairs2b['type'] = str('CLL')
pairs3['type'] = str('normal')
pairs3a['type'] = str('normal')
pairs3b['type'] = str('normal')
pairs3c['type'] = str('normal')
pairs4['type'] = str('normal')
pairs4a['type'] = str('normal')
pairs4b['type'] = str('normal')
pairs4c['type'] = str('normal')
pairs5['type'] = str('normal')
pairs5a['type'] = str('normal')
pairs5b['type'] = str('normal')
pairs5c['type'] = str('normal')
pairs6['type'] = str('normal')
pairs6a['type'] = str('normal')
pairs6b['type'] = str('normal')
pairs6c['type'] = str('normal')
pairs6d['type'] = str('normal')
pairs6e['type'] = str('normal')


frames = [pairs1, pairs1a, pairs2, pairs2a, pairs2b, pairs3, pairs3a, pairs3b, pairs3c, 
          pairs4, pairs4a, pairs4b, pairs4c, pairs5, pairs5a, pairs5b, pairs5c, pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e]



In [120]:

    
total_pairs = pd.concat(frames)



In [121]:

    
total_pairs.shape









    Out[121]:





(4035, 15)



In [122]:

    
y = total_pairs.methylation_difference # dependent variable
X = total_pairs.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)



In [123]:

    
categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)
    
X = X.drop(['type_normal'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation")
est.summary()









    



Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation






    Out[123]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:              0.043 


  Model:                       OLS             Adj. R-squared:         0.042 


  Method:                 Least Squares        F-statistic:            44.80 


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):   7.35e-37 


  Time:                     18:37:24           Log-Likelihood:        9099.1 


  No. Observations:            4035            AIC:                -1.819e+04


  Df Residuals:                4030            BIC:                -1.816e+04


  Df Model:                       4                                          


  Covariance Type:          nonrobust                                        




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    0.1532      0.031      4.988   0.000      0.093     0.213


  total_cpg_no_filter  -1.171e-08    2.1e-09     -5.566   0.000  -1.58e-08 -7.59e-09


  bsRate_mean             -0.1167      0.029     -4.061   0.000     -0.173    -0.060


  avgReadCpG_mean          0.0002      0.001      0.341   0.733     -0.001     0.002


  type_CLL                -0.0106      0.001     -9.594   0.000     -0.013    -0.008




  Omnibus:        578.877    Durbin-Watson:         1.457 


  Prob(Omnibus):   0.000     Jarque-Bera (JB):    883.294 


  Skew:            1.021     Prob(JB):           1.57e-192


  Kurtosis:        4.043     Cond. No.           5.10e+07



In [124]:

    
X.head()









    Out[124]:






  
    
      
      const
      total_cpg_no_filter
      bsRate_mean
      avgReadCpG_mean
      type_CLL
    
  
  
    
      0
      1
      709541.5
      0.980181
      7.029760
      1.0
    
    
      1
      1
      786284.0
      0.980293
      7.017405
      1.0
    
    
      2
      1
      683623.5
      0.980268
      7.039100
      1.0
    
    
      3
      1
      853200.5
      0.980262
      7.028117
      1.0
    
    
      4
      1
      842686.5
      0.980228
      7.025451
      1.0



In [125]:

    
print("methylation_difference vs. bsRate, jointplot, both CLL and normal ")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, both CLL and normal ")









    



methylation_difference vs. bsRate, jointplot, both CLL and normal 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[125]:





<matplotlib.text.Text at 0x11a756358>



In [126]:

    
print("methylation_difference vs. mean avgReadCpG per cell, jointplot, both CLL and normal ")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. avgReadCpG_mean, jointplot, both CLL and normal ")









    



methylation_difference vs. mean avgReadCpG per cell, jointplot, both CLL and normal 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[126]:





<matplotlib.text.Text at 0x11aab3198>



In [127]:

    
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. total # Unique_CpGs per cell, jointplot, both CLL and normal ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[127]:





<matplotlib.text.Text at 0x11add4898>



In [128]:

    
print("methylation_difference vs. bsRate, by type! ")
sns.lmplot(x="bsRate_mean", y="methylation_difference",  data=total_pairs, hue='type')
plt.title("methylation_difference vs. bsRate, by type, CLL vs normal")









    



methylation_difference vs. bsRate, by type! 






    Out[128]:





<matplotlib.text.Text at 0x11b0e6e48>



In [129]:

    
print("methylation_difference vs. avgReadCpG per cell, by type! ")
sns.lmplot(x="avgReadCpG_mean", y="methylation_difference",  data=total_pairs, hue='type')
plt.title("methylation_difference vs. avgReadCpG per cell, by type, CLL vs Normal ")









    



methylation_difference vs. avgReadCpG per cell, by type! 






    Out[129]:





<matplotlib.text.Text at 0x11b21dba8>



In [130]:

    
print("methylation_difference vs. total unique CpG per cel, by type!")
sns.lmplot(x="total_cpg_no_filter", y="methylation_difference",  data=total_pairs, hue='type')
plt.title("methylation_difference vs. total unique CpG per cell, by type, CLL vs Normal ")









    



methylation_difference vs. total unique CpG per cel, by type!






    Out[130]:





<matplotlib.text.Text at 0x11b35deb8>



In [131]:

    
#
# Let's see feature ranking
#
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting methylation_difference: cell type, avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:11: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting






    



Random Forest model score is 0.895730689281



In [132]:

    
#
# Now do analysis by cell type, CLL vs normal, entire
#
cll_frames = [pairs1, pairs1a, pairs2, pairs2a, pairs2b]

normal_frames = [pairs3, pairs3a, pairs3b, pairs3c, pairs4, pairs4a, pairs4b, pairs4c, 
                 pairs5, pairs5a, pairs5b, pairs5c, pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e]

cll_pairs = pd.concat(cll_frames)
print(cll_pairs.shape)

normal_pairs = pd.concat(normal_frames)
print(normal_pairs.shape)









    



(990, 15)
(3045, 15)



In [133]:

    
#
# CLL first
#
y = cll_pairs.methylation_difference # dependent variable
X = cll_pairs.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG", "type"], axis=1)

X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation")
print(X.shape)
est.summary()









    



Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation
(990, 4)






    Out[133]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:             0.201


  Model:                       OLS             Adj. R-squared:        0.198


  Method:                 Least Squares        F-statistic:           82.59


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):  1.14e-47


  Time:                     18:40:03           Log-Likelihood:       2540.5


  No. Observations:             990            AIC:                  -5073.


  Df Residuals:                 986            BIC:                  -5053.


  Df Model:                       3                                        


  Covariance Type:          nonrobust                                      




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   -0.1089      0.125     -0.871   0.384     -0.354     0.137


  total_cpg_no_filter  -4.165e-08   5.02e-09     -8.292   0.000  -5.15e-08 -3.18e-08


  bsRate_mean             -0.1666      0.114     -1.465   0.143     -0.390     0.057


  avgReadCpG_mean          0.0450      0.008      5.576   0.000      0.029     0.061




  Omnibus:        76.465    Durbin-Watson:         1.679


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     98.596


  Skew:            0.659    Prob(JB):           3.89e-22


  Kurtosis:        3.810    Cond. No.           1.66e+08



In [134]:

    
print("methylation_difference vs. bsRate, jointplot, CLL pairs ")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  CLL pairs ")









    



methylation_difference vs. bsRate, jointplot, CLL pairs 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[134]:





<matplotlib.text.Text at 0x161c30278>



In [135]:

    
print("methylation_difference vs. mean avgReadCpG per cell, jointplot,  CLL pairs ")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. avgReadCpG_mean, jointplot, CLL pairs ")









    



methylation_difference vs. mean avgReadCpG per cell, jointplot,  CLL pairs 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[135]:





<matplotlib.text.Text at 0x22bc59c18>



In [136]:

    
print("methylation diference vs. total # unique CpGs per cell, CLL pairs")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. total # Unique_CpGs per cell,  CLL pairs")









    



methylation diference vs. total # unique CpGs per cell, CLL pairs






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[136]:





<matplotlib.text.Text at 0x22dbf50b8>



In [137]:

    
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting methylation_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting






    



Random Forest model score is 0.911536366763



In [ ]:



In [138]:

    
#
# normal
#
y = normal_pairs.methylation_difference # dependent variable
X = normal_pairs.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG", "type"], axis=1)

X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation")
print(X.shape)
est.summary()









    



Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation
(3045, 4)






    Out[138]:





OLS Regression Results

  Dep. Variable:     methylation_difference    R-squared:              0.003 


  Model:                       OLS             Adj. R-squared:         0.002 


  Method:                 Least Squares        F-statistic:            2.827 


  Date:                 Tue, 09 Aug 2016       Prob (F-statistic):    0.0372  


  Time:                     18:40:45           Log-Likelihood:        6693.1 


  No. Observations:            3045            AIC:                -1.338e+04


  Df Residuals:                3041            BIC:                -1.335e+04


  Df Model:                       3                                          


  Covariance Type:          nonrobust                                        




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    0.0540      0.035      1.530   0.126     -0.015     0.123


  total_cpg_no_filter  -7.466e-10   2.61e-09     -0.286   0.775  -5.87e-09  4.37e-09


  bsRate_mean             -0.0290      0.033     -0.885   0.376     -0.093     0.035


  avgReadCpG_mean          0.0015      0.001      1.979   0.048   1.42e-05     0.003




  Omnibus:        404.716    Durbin-Watson:         1.467 


  Prob(Omnibus):   0.000     Jarque-Bera (JB):    585.076 


  Skew:            0.997     Prob(JB):           8.96e-128


  Kurtosis:        3.798     Cond. No.           4.39e+07



In [139]:

    
print("methylation_difference vs. bsRate, jointplot, Normal pairs ")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal pairs ")









    



methylation_difference vs. bsRate, jointplot, Normal pairs 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[139]:





<matplotlib.text.Text at 0x11bb8e048>



In [140]:

    
print("methylation_difference vs. mean avgReadCpG per cell, jointplot, Normal pairs ")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. avgReadCpG_mean, jointplot, Normal pairs ")









    



methylation_difference vs. mean avgReadCpG per cell, jointplot, Normal pairs 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[140]:





<matplotlib.text.Text at 0x11c0f2c50>



In [141]:

    
print("methylation diference vs. total # unique CpGs per cell, Normal pairs")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. total # Unique_CpGs per cell, Normal pairs")









    



methylation diference vs. total # unique CpGs per cell, Normal pairs






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[141]:





<matplotlib.text.Text at 0x11c41b080>



In [142]:

    
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting methylation_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting






    



Random Forest model score is 0.888721635344



In [145]:

    
X









    Out[145]:






  
    
      
      const
      total_cpg_no_filter
      bsRate_mean
      avgReadCpG_mean
    
  
  
    
      0
      1
      416763.5
      0.997315
      6.891204
    
    
      1
      1
      309204.0
      0.997480
      6.972952
    
    
      2
      1
      339687.5
      0.997430
      6.847710
    
    
      3
      1
      408786.5
      0.997382
      6.851644
    
    
      4
      1
      384508.5
      0.997391
      6.858078
    
    
      5
      1
      396657.0
      0.997365
      6.858142
    
    
      6
      1
      338167.0
      0.997429
      6.851672
    
    
      7
      1
      344211.5
      0.997454
      6.881279
    
    
      8
      1
      425468.5
      0.997407
      6.827551
    
    
      9
      1
      342130.5
      0.997368
      6.860331
    
    
      10
      1
      321270.0
      0.997328
      6.905727
    
    
      11
      1
      282875.5
      0.997340
      6.879531
    
    
      12
      1
      294697.0
      0.997364
      6.978425
    
    
      13
      1
      360588.5
      0.997415
      6.895013
    
    
      14
      1
      380398.0
      0.997408
      6.905550
    
    
      15
      1
      301601.5
      0.997372
      6.937971
    
    
      16
      1
      322817.5
      0.997380
      7.015534
    
    
      17
      1
      353301.0
      0.997330
      6.890292
    
    
      18
      1
      422400.0
      0.997282
      6.894226
    
    
      19
      1
      398122.0
      0.997291
      6.900660
    
    
      20
      1
      410270.5
      0.997265
      6.900724
    
    
      21
      1
      351780.5
      0.997329
      6.894254
    
    
      22
      1
      357825.0
      0.997354
      6.923861
    
    
      23
      1
      439082.0
      0.997307
      6.870133
    
    
      24
      1
      355744.0
      0.997268
      6.902913
    
    
      25
      1
      334883.5
      0.997228
      6.948309
    
    
      26
      1
      296489.0
      0.997240
      6.922113
    
    
      27
      1
      308310.5
      0.997264
      7.021007
    
    
      28
      1
      374202.0
      0.997315
      6.937595
    
    
      29
      1
      394011.5
      0.997308
      6.948132
    
    
      ...
      ...
      ...
      ...
      ...
    
    
      75
      1
      535320.0
      0.961082
      6.990924
    
    
      76
      1
      497376.5
      0.960889
      6.977585
    
    
      77
      1
      507416.5
      0.961103
      6.972817
    
    
      78
      1
      413178.5
      0.961120
      6.937515
    
    
      79
      1
      296916.5
      0.960838
      7.015553
    
    
      80
      1
      382519.0
      0.961078
      6.983289
    
    
      81
      1
      390752.0
      0.961159
      6.967151
    
    
      82
      1
      423436.0
      0.961255
      6.984978
    
    
      83
      1
      385492.5
      0.961062
      6.971639
    
    
      84
      1
      544249.0
      0.961015
      6.936375
    
    
      85
      1
      427987.0
      0.960732
      7.014413
    
    
      86
      1
      513589.5
      0.960973
      6.982149
    
    
      87
      1
      521822.5
      0.961054
      6.966011
    
    
      88
      1
      554506.5
      0.961150
      6.983837
    
    
      89
      1
      516563.0
      0.960956
      6.970499
    
    
      90
      1
      333749.0
      0.960749
      6.979111
    
    
      91
      1
      419351.5
      0.960990
      6.946847
    
    
      92
      1
      427584.5
      0.961071
      6.930709
    
    
      93
      1
      460268.5
      0.961167
      6.948535
    
    
      94
      1
      422325.0
      0.960974
      6.935197
    
    
      95
      1
      303089.5
      0.960707
      7.024885
    
    
      96
      1
      311322.5
      0.960788
      7.008748
    
    
      97
      1
      344006.5
      0.960884
      7.026574
    
    
      98
      1
      306063.0
      0.960691
      7.013236
    
    
      99
      1
      396925.0
      0.961029
      6.976484
    
    
      100
      1
      429609.0
      0.961125
      6.994310
    
    
      101
      1
      391665.5
      0.960932
      6.980971
    
    
      102
      1
      437842.0
      0.961206
      6.978172
    
    
      103
      1
      399898.5
      0.961013
      6.964834
    
    
      104
      1
      432582.5
      0.961109
      6.982660
    
  

3045 rows × 4 columns



In [ ]:



In [ ]:

	filename	methylation	PDR_total	methylation_unweighted	PDR_unweighted	thisMeth	mixedReadCount	total_reads	type	bio	protocol	total_cpg_no_filter	total_cpg_gtrthan1	total_cpg_gtrthan38	bsRate	avgReadCpgs_nofilter	avgReadCpgs_lessthan1CpG	avgReadCpgs_gtreql3.8CpG
0	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CATGAC	0.529505	0.235795	0.632802	0.231878	2208325.0	983394.0	4170549.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	525282.0	525251.0	435636.0	0.9975	5.354284	5.355660	7.019255
1	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CCTTCG	0.455550	0.177631	0.583859	0.175371	733064.0	285841.0	1609185.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	221972.0	221962.0	186757.0	0.9975	5.587294	5.588449	7.302612
2	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CGGTAG	0.515269	0.177645	0.618578	0.174221	1452802.0	500870.0	2819500.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	355730.0	355713.0	295624.0	0.9975	5.393199	5.394331	7.079288
3	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTATTG	0.556175	0.176367	0.652727	0.172273	2279354.0	722800.0	4098270.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	483179.0	483150.0	397812.0	0.9975	5.287116	5.288477	6.979525
4	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTCAGC	0.528642	0.181331	0.640401	0.172287	1394208.0	478231.0	2637340.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	356122.0	356100.0	294065.0	0.9975	5.314302	5.315719	6.995052

Dep. Variable:	methylation_difference	R-squared:	0.051
Model:	OLS	Adj. R-squared:	0.037
Method:	Least Squares	F-statistic:	3.694
Date:	Tue, 09 Aug 2016	Prob (F-statistic):	0.0127
Time:	18:36:00	Log-Likelihood:	634.72
No. Observations:	210	AIC:	-1261.
Df Residuals:	206	BIC:	-1248.
Df Model:	3
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
const	-30.3253	11.387	-2.663	0.008	-52.775 -7.875
total_cpg_no_filter	-3.279e-08	1.12e-08	-2.915	0.004	-5.5e-08 -1.06e-08
bsRate_mean	30.6098	11.611	2.636	0.009	7.719 53.501
avgReadCpG_mean	0.0516	0.029	1.793	0.075	-0.005 0.108

Omnibus:	35.998	Durbin-Watson:	2.175
Prob(Omnibus):	0.000	Jarque-Bera (JB):	10.728
Skew:	0.253	Prob(JB):	0.00468
Kurtosis:	2.015	Cond. No.	1.46e+10

Omnibus:	24.923	Durbin-Watson:	2.253
Prob(Omnibus):	0.000	Jarque-Bera (JB):	7.923
Skew:	0.135	Prob(JB):	0.0190
Kurtosis:	2.088	Cond. No.	1.68e+10

Omnibus:	8.958	Durbin-Watson:	1.474
Prob(Omnibus):	0.011	Jarque-Bera (JB):	12.260
Skew:	0.301	Prob(JB):	0.00218
Kurtosis:	4.089	Cond. No.	3.27e+09

Omnibus:	3.281	Durbin-Watson:	2.393
Prob(Omnibus):	0.194	Jarque-Bera (JB):	2.603
Skew:	-0.160	Prob(JB):	0.272
Kurtosis:	2.524	Cond. No.	4.47e+09

Omnibus:	7.301	Durbin-Watson:	1.539
Prob(Omnibus):	0.026	Jarque-Bera (JB):	6.896
Skew:	0.411	Prob(JB):	0.0318
Kurtosis:	2.557	Cond. No.	6.96e+09

Omnibus:	3.415	Durbin-Watson:	1.088
Prob(Omnibus):	0.181	Jarque-Bera (JB):	3.118
Skew:	0.293	Prob(JB):	0.210
Kurtosis:	2.545	Cond. No.	7.56e+09

Omnibus:	8.090	Durbin-Watson:	1.492
Prob(Omnibus):	0.018	Jarque-Bera (JB):	8.393
Skew:	0.465	Prob(JB):	0.0151
Kurtosis:	2.921	Cond. No.	1.03e+10

Omnibus:	4.895	Durbin-Watson:	1.943
Prob(Omnibus):	0.086	Jarque-Bera (JB):	3.450
Skew:	-0.676	Prob(JB):	0.178
Kurtosis:	3.687	Cond. No.	3.29e+10

Omnibus:	13.771	Durbin-Watson:	1.488
Prob(Omnibus):	0.001	Jarque-Bera (JB):	15.195
Skew:	0.693	Prob(JB):	0.000502
Kurtosis:	3.023	Cond. No.	8.48e+09

Omnibus:	1.604	Durbin-Watson:	1.420
Prob(Omnibus):	0.448	Jarque-Bera (JB):	1.661
Skew:	0.201	Prob(JB):	0.436
Kurtosis:	2.685	Cond. No.	1.52e+10

Omnibus:	6.175	Durbin-Watson:	1.448
Prob(Omnibus):	0.046	Jarque-Bera (JB):	6.310
Skew:	0.494	Prob(JB):	0.0426
Kurtosis:	2.888	Cond. No.	1.55e+10

Omnibus:	3.749	Durbin-Watson:	1.332
Prob(Omnibus):	0.153	Jarque-Bera (JB):	3.798
Skew:	0.389	Prob(JB):	0.150
Kurtosis:	2.745	Cond. No.	1.28e+10

Omnibus:	3.871	Durbin-Watson:	2.144
Prob(Omnibus):	0.144	Jarque-Bera (JB):	3.833
Skew:	0.290	Prob(JB):	0.147
Kurtosis:	2.682	Cond. No.	3.13e+10

Omnibus:	20.949	Durbin-Watson:	1.419
Prob(Omnibus):	0.000	Jarque-Bera (JB):	24.161
Skew:	0.811	Prob(JB):	5.67e-06
Kurtosis:	3.365	Cond. No.	1.48e+10

Omnibus:	15.542	Durbin-Watson:	1.021
Prob(Omnibus):	0.000	Jarque-Bera (JB):	16.858
Skew:	0.702	Prob(JB):	0.000218
Kurtosis:	3.401	Cond. No.	1.63e+10

Omnibus:	12.686	Durbin-Watson:	1.478
Prob(Omnibus):	0.002	Jarque-Bera (JB):	13.548
Skew:	0.592	Prob(JB):	0.00114
Kurtosis:	3.068	Cond. No.	2.98e+10

Omnibus:	9.227	Durbin-Watson:	1.561
Prob(Omnibus):	0.010	Jarque-Bera (JB):	9.799
Skew:	0.552	Prob(JB):	0.00745
Kurtosis:	2.857	Cond. No.	1.62e+10

Omnibus:	13.706	Durbin-Watson:	1.123
Prob(Omnibus):	0.001	Jarque-Bera (JB):	15.393
Skew:	0.734	Prob(JB):	0.000454
Kurtosis:	2.930	Cond. No.	9.65e+08

Omnibus:	11.696	Durbin-Watson:	1.748
Prob(Omnibus):	0.003	Jarque-Bera (JB):	12.109
Skew:	0.583	Prob(JB):	0.00235
Kurtosis:	2.590	Cond. No.	4.47e+09

Omnibus:	12.063	Durbin-Watson:	1.344
Prob(Omnibus):	0.002	Jarque-Bera (JB):	13.236
Skew:	0.666	Prob(JB):	0.00134
Kurtosis:	2.713	Cond. No.	7.17e+09

Omnibus:	11.347	Durbin-Watson:	1.718
Prob(Omnibus):	0.003	Jarque-Bera (JB):	12.434
Skew:	0.653	Prob(JB):	0.00200
Kurtosis:	2.802	Cond. No.	5.35e+09

Omnibus:	9.120	Durbin-Watson:	1.943
Prob(Omnibus):	0.010	Jarque-Bera (JB):	9.711
Skew:	0.571	Prob(JB):	0.00778
Kurtosis:	2.761	Cond. No.	4.19e+09

Omnibus:	8.251	Durbin-Watson:	1.945
Prob(Omnibus):	0.016	Jarque-Bera (JB):	8.349
Skew:	0.689	Prob(JB):	0.0154
Kurtosis:	3.107	Cond. No.	6.03e+09

Omnibus:	578.877	Durbin-Watson:	1.457
Prob(Omnibus):	0.000	Jarque-Bera (JB):	883.294
Skew:	1.021	Prob(JB):	1.57e-192
Kurtosis:	4.043	Cond. No.	5.10e+07

	const	total_cpg_no_filter	bsRate_mean	avgReadCpG_mean	type_CLL
0	1	709541.5	0.980181	7.029760	1.0
1	1	786284.0	0.980293	7.017405	1.0
2	1	683623.5	0.980268	7.039100	1.0
3	1	853200.5	0.980262	7.028117	1.0
4	1	842686.5	0.980228	7.025451	1.0

Omnibus:	76.465	Durbin-Watson:	1.679
Prob(Omnibus):	0.000	Jarque-Bera (JB):	98.596
Skew:	0.659	Prob(JB):	3.89e-22
Kurtosis:	3.810	Cond. No.	1.66e+08

Omnibus:	404.716	Durbin-Watson:	1.467
Prob(Omnibus):	0.000	Jarque-Bera (JB):	585.076
Skew:	0.997	Prob(JB):	8.96e-128
Kurtosis:	3.798	Cond. No.	4.39e+07