notebook.community

Edit and run



In [1]:

    
%matplotlib inline



In [2]:

    
#
# Covariates are: 
#   - Number of unique CpGs per cell
#   - Median Average Read CpG per cell (or mean if normally distributed)
#   - BS rate per cell
#   - CLL or Normal status per cell
#
# For distances (i.e. the PDR difference or the methylation difference between the two cells of a pair),
# the covariates are the means of the two cells' values.
#



In [3]:

    
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50)  # show up to 50 columns when a DataFrame is displayed


import os
# All CSV files below are read relative to this directory. Set the RRBS_DATA_DIR
# environment variable to run the notebook on another machine without editing the path;
# the original location is kept as the default.
os.chdir(os.environ.get('RRBS_DATA_DIR', '/Users/evanbiederstedt/Downloads/RRBS_data_files'))

import statsmodels.api as sm



In [4]:

    
# Load the RRBS statistics table (read relative to the current working directory).
stats = pd.read_csv("all_RRBS_statistics_final.csv")



In [5]:

    
# (rows, columns) of the statistics table.
stats.shape









    Out[5]:





(438, 15)



In [6]:

    
# Partition the statistics table by the value of its "type" column.
normal = stats.loc[stats["type"] == "normal"]
CLL = stats.loc[stats["type"] == "CLL"]



In [7]:

    
# Number of rows whose type is "normal".
len(normal)









    Out[7]:





336



In [8]:

    
# Number of rows whose type is "CLL".
len(CLL)









    Out[8]:





102



In [9]:

    
# Read the six CpG tables (one CSV per group) from the working directory.
(mcell_cpg,
 pcell_cpg,
 CD19cell_cpg,
 normalB_cell_cpg,
 trito_cell_cpg,
 cw154_cell_cpg) = [
    pd.read_csv(csv_name)
    for csv_name in (
        "mcell_avgCpGs.csv",
        "pcell_avgCpGs.csv",
        "CD19_avgCpGs.csv",
        "normalB_avgCpGs.csv",
        "trito_avgCpGs.csv",
        "cw154_cpgs.csv",
    )
]



In [10]:

    
# Remove the "Unnamed: 0" column from each CpG table
# (presumably the index column written out when the CSVs were saved - TODO confirm).
(mcell_cpg,
 pcell_cpg,
 CD19cell_cpg,
 normalB_cell_cpg,
 trito_cell_cpg,
 cw154_cell_cpg) = [
    table.drop(["Unnamed: 0"], axis="columns")
    for table in (
        mcell_cpg,
        pcell_cpg,
        CD19cell_cpg,
        normalB_cell_cpg,
        trito_cell_cpg,
        cw154_cell_cpg,
    )
]



In [11]:

    
# Stack the six CpG tables row-wise into a single table (original row labels are kept).
cpg_total = pd.concat(
    [
        mcell_cpg,
        pcell_cpg,
        CD19cell_cpg,
        normalB_cell_cpg,
        trito_cell_cpg,
        cw154_cell_cpg,
    ],
    axis=0,
)



In [12]:

    
# (rows, columns) of the combined CpG table.
cpg_total.shape









    Out[12]:





(513, 4)



In [13]:

    
# Attach the CpG columns to the statistics table; rows are matched on "filename"
# and only filenames present in both tables are kept (inner join).
merged = pd.merge(stats, cpg_total, how="inner", on="filename")



In [14]:

    
# (rows, columns) after the merge; compare with stats.shape to see whether any rows were lost.
merged.shape









    Out[14]:





(438, 18)



In [15]:

    
# Preview the first rows of the merged table.
merged.head()









    Out[15]:






  
    
      
      filename
      methylation
      PDR_total
      methylation_unweighted
      PDR_unweighted
      thisMeth
      mixedReadCount
      total_reads
      type
      bio
      protocol
      total_cpg_no_filter
      total_cpg_gtrthan1
      total_cpg_gtrthan38
      bsRate
      avgReadCpgs_nofilter
      avgReadCpgs_lessthan1CpG
      avgReadCpgs_gtreql3.8CpG
    
  
  
    
      0
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CATGAC
      0.529505
      0.235795
      0.632802
      0.231878
      2208325.0
      983394.0
      4170549.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      525282.0
      525251.0
      435636.0
      0.9975
      5.354284
      5.355660
      7.019255
    
    
      1
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CCTTCG
      0.455550
      0.177631
      0.583859
      0.175371
      733064.0
      285841.0
      1609185.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      221972.0
      221962.0
      186757.0
      0.9975
      5.587294
      5.588449
      7.302612
    
    
      2
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CGGTAG
      0.515269
      0.177645
      0.618578
      0.174221
      1452802.0
      500870.0
      2819500.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      355730.0
      355713.0
      295624.0
      0.9975
      5.393199
      5.394331
      7.079288
    
    
      3
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTATTG
      0.556175
      0.176367
      0.652727
      0.172273
      2279354.0
      722800.0
      4098270.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      483179.0
      483150.0
      397812.0
      0.9975
      5.287116
      5.288477
      6.979525
    
    
      4
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTCAGC
      0.528642
      0.181331
      0.640401
      0.172287
      1394208.0
      478231.0
      2637340.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      356122.0
      356100.0
      294065.0
      0.9975
      5.314302
      5.315719
      6.995052



In [ ]:



In [ ]:



In [16]:

    
from itertools import combinations

# Pairwise PDR regression for the 'trito_pool_1' protocol.
# One observation per unordered pair of files: the response is the absolute difference
# of the two PDR_total values, the covariates are the means of the two files' values.
tritopool = merged[merged["protocol"] == 'trito_pool_1']       # select only "trito_pool_1" files
print(len(tritopool))
tritopool = tritopool.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # columns not used below
tritopoolA = tritopool.set_index("filename")

cc = list(combinations(tritopool.filename, 2)) # all unordered pairs of filenames
# list(c) selects the two rows of the pair explicitly. numeric_only=True averages only the
# numeric columns: the string-valued "type" column is still present here and current pandas
# raises on it instead of silently dropping it.
out = pd.DataFrame([tritopoolA.loc[list(c), :].mean(numeric_only=True) for c in cc], index=cc)

# n x n matrix of absolute PDR differences. The ufunc is applied to plain NumPy arrays
# because `ufunc.outer` is not supported on pandas Series in newer pandas releases.
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.PDR_total.values, tritopool.PDR_total.values)),
                     tritopool.filename, tritopool.filename)
stacked = df_ex.stack()  # one row per ordered (row file, column file) combination
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

out['filename'] = out.index  # the (file_a, file_b) tuple is the join key
out = out.reset_index(drop=True)
# 'filename' is the only column the two frames share, so it is named explicitly as the key.
pairs1 = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs1.shape)

pairs1 = pairs1.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})


y = pairs1.PDR_difference # dependent variable to predict

# Keep only the covariates used in the model.
X = pairs1.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
# NOTE(review): every file takes part in n-1 pairs, so the rows are not independent
# observations; read the OLS standard errors and p-values with that in mind.
est = sm.OLS(y, X).fit()
# The backslash is escaped so the text prints as before without an invalid-escape warning.
print("Regression results for CLL 'RRBS_trito_pool1', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



21
(210, 14)
(210,)
(210, 3)
Regression results for CLL 'RRBS_trito_pool1', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[16]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.015


  Model:                    OLS          Adj. R-squared:        0.001


  Method:              Least Squares     F-statistic:           1.038


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):    0.377 


  Time:                  00:35:47        Log-Likelihood:       826.52


  No. Observations:          210         AIC:                  -1645.


  Df Residuals:              206         BIC:                  -1632.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    0.0408      4.568      0.009   0.993     -8.966     9.048


  total_cpg_no_filter  -5.724e-09   4.51e-09     -1.268   0.206  -1.46e-08  3.17e-09


  bsRate_mean             -0.0226      4.658     -0.005   0.996     -9.206     9.161


  avgReadCpG_mean         -0.0011      0.012     -0.094   0.925     -0.024     0.022




  Omnibus:        14.287    Durbin-Watson:         1.921


  Prob(Omnibus):   0.001    Jarque-Bera (JB):     14.954


  Skew:            0.618    Prob(JB):           0.000566


  Kurtosis:        2.575    Cond. No.           1.46e+10



In [17]:

    
# Joint plot with regression fit: pair-mean BS rate against the pairwise PDR difference (pairs1).
print("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(
    data=pairs1,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")









    



PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[17]:





<matplotlib.text.Text at 0x107628e80>



In [18]:

    
# Joint plot with regression fit: pair-mean avgReadCpG against the pairwise PDR difference (pairs1).
print("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(
    data=pairs1,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")









    



PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[18]:





<matplotlib.text.Text at 0x1079844e0>



In [19]:

    
# Joint plot with regression fit: pair-mean total unique CpGs against the pairwise PDR difference (pairs1).
print("PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs1, kind="reg")
# The title now names the variable actually plotted on x (it previously said avgReadCpGs_mean).
plt.title("PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_1")









    



PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_1






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[19]:





<matplotlib.text.Text at 0x1083edb70>



In [20]:

    
from itertools import combinations

# Pairwise PDR regression for the 'trito_pool_2' protocol.
# One observation per unordered pair of files: the response is the absolute difference
# of the two PDR_total values, the covariates are the means of the two files' values.
tritopool = merged[merged["protocol"] == 'trito_pool_2']       # select only "trito_pool_2" files
print(len(tritopool))
tritopool = tritopool.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # columns not used below
tritopoolA = tritopool.set_index("filename")

cc = list(combinations(tritopool.filename, 2)) # all unordered pairs of filenames
# list(c) selects the two rows of the pair explicitly. numeric_only=True averages only the
# numeric columns: the string-valued "type" column is still present here and current pandas
# raises on it instead of silently dropping it.
out = pd.DataFrame([tritopoolA.loc[list(c), :].mean(numeric_only=True) for c in cc], index=cc)

# n x n matrix of absolute PDR differences. The ufunc is applied to plain NumPy arrays
# because `ufunc.outer` is not supported on pandas Series in newer pandas releases.
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.PDR_total.values, tritopool.PDR_total.values)),
                     tritopool.filename, tritopool.filename)
stacked = df_ex.stack()  # one row per ordered (row file, column file) combination
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

out['filename'] = out.index  # the (file_a, file_b) tuple is the join key
out = out.reset_index(drop=True)
# 'filename' is the only column the two frames share, so it is named explicitly as the key.
pairs1a = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs1a.shape)
pairs1a = pairs1a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})


y = pairs1a.PDR_difference # dependent variable to predict

# Keep only the covariates used in the model.
X = pairs1a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
# NOTE(review): every file takes part in n-1 pairs, so the rows are not independent
# observations; read the OLS standard errors and p-values with that in mind.
est = sm.OLS(y, X).fit()
# Label corrected to pool 2 (it was copied from the pool 1 cell); the backslash is escaped
# so "\delta" prints literally without an invalid-escape warning.
print("Regression results for CLL 'RRBS_trito_pool2', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



21
(210, 14)
(210,)
(210, 3)
Regression results for CLL 'RRBS_trito_pool1', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[20]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.045


  Model:                    OLS          Adj. R-squared:        0.031


  Method:              Least Squares     F-statistic:           3.260


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):   0.0225 


  Time:                  00:35:52        Log-Likelihood:       801.55


  No. Observations:          210         AIC:                  -1595.


  Df Residuals:              206         BIC:                  -1582.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    1.2956      5.970      0.217   0.828    -10.475    13.066


  total_cpg_no_filter   1.307e-08   4.75e-09      2.749   0.007    3.7e-09  2.24e-08


  bsRate_mean             -1.3146      6.192     -0.212   0.832    -13.522    10.892


  avgReadCpG_mean         -0.0020      0.015     -0.135   0.893     -0.032     0.028




  Omnibus:        12.567    Durbin-Watson:         2.501


  Prob(Omnibus):   0.002    Jarque-Bera (JB):     10.852


  Skew:            0.478    Prob(JB):            0.00440


  Kurtosis:        2.428    Cond. No.           1.68e+10



In [21]:

    
# Joint plot with regression fit: pair-mean BS rate against the pairwise PDR difference (pairs1a).
print("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(
    data=pairs1a,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")









    



PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[21]:





<matplotlib.text.Text at 0x10873f748>



In [22]:

    
# Joint plot with regression fit: pair-mean avgReadCpG against the pairwise PDR difference (pairs1a).
print("PDR_difference vs. mean avgReadCpGs, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(
    data=pairs1a,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")









    



PDR_difference vs. mean avgReadCpGs, jointplot, RRBS_trito_pool CLL trito_2






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[22]:





<matplotlib.text.Text at 0x10896d2e8>



In [23]:

    
# Joint plot with regression fit: pair-mean total unique CpGs against the pairwise PDR difference (pairs1a).
print("PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs1a, kind="reg")
# The title now names the variable actually plotted on x (it previously said avgReadCpGs_mean).
plt.title("PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_2")









    



PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_2






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[23]:





<matplotlib.text.Text at 0x108c7c9e8>



In [ ]:



In [24]:

    
from itertools import combinations

# Pairwise PDR regression for the 'cw154_Tris_protease' protocol.
# One observation per unordered pair of files: the response is the absolute difference
# of the two PDR_total values, the covariates are the means of the two files' values.
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # columns not used below
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")

cc = list(combinations(cw154.filename, 2)) # all unordered pairs of filenames
# list(c) selects the two rows of the pair explicitly. numeric_only=True averages only the
# numeric columns: the string-valued "type" column is still present here and current pandas
# raises on it instead of silently dropping it.
out = pd.DataFrame([cw154A.loc[list(c), :].mean(numeric_only=True) for c in cc], index=cc)

# n x n matrix of absolute PDR differences. The ufunc is applied to plain NumPy arrays
# because `ufunc.outer` is not supported on pandas Series in newer pandas releases.
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.PDR_total.values, cw154.PDR_total.values)),
                     cw154.filename, cw154.filename)
stacked = df_ex.stack()  # one row per ordered (row file, column file) combination
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

out['filename'] = out.index  # the (file_a, file_b) tuple is the join key
out = out.reset_index(drop=True)
# 'filename' is the only column the two frames share, so it is named explicitly as the key.
pairs2 = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs2.shape)
pairs2 = pairs2.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs2.PDR_difference # dependent variable to predict

# Keep only the covariates used in the model.
X = pairs2.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
# NOTE(review): every file takes part in n-1 pairs, so the rows are not independent
# observations; read the OLS standard errors and p-values with that in mind.
est = sm.OLS(y, X).fit()
# The backslash is escaped so the text prints as before without an invalid-escape warning.
print("Regression results for CLL 'cw154_Tris_protease', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_Tris_protease', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[24]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.025


  Model:                    OLS          Adj. R-squared:        0.009


  Method:              Least Squares     F-statistic:           1.571


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):    0.198 


  Time:                  00:35:55        Log-Likelihood:       719.16


  No. Observations:          190         AIC:                  -1430.


  Df Residuals:              186         BIC:                  -1417.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    1.5147      2.434      0.622   0.535     -3.287     6.317


  total_cpg_no_filter  -8.124e-09   4.13e-09     -1.969   0.050  -1.63e-08  1.44e-11


  bsRate_mean             -1.5880      2.548     -0.623   0.534     -6.615     3.439


  avgReadCpG_mean          0.0032      0.008      0.411   0.682     -0.012     0.018




  Omnibus:        14.219    Durbin-Watson:         2.170


  Prob(Omnibus):   0.001    Jarque-Bera (JB):      8.815


  Skew:            0.378    Prob(JB):             0.0122


  Kurtosis:        2.264    Cond. No.           3.27e+09



In [25]:

    
# Joint plot with regression fit: pair-mean BS rate against the pairwise PDR difference (pairs2).
print("PDR_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease")
sns.jointplot(
    data=pairs2,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. bsRate, jointplot, cw154  CLL cw154_Tris_protease")









    



PDR_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[25]:





<matplotlib.text.Text at 0x1087241d0>



In [26]:

    
# Joint plot with regression fit: pair-mean avgReadCpG against the pairwise PDR difference (pairs2).
print("PDR_difference vs. avgReadCpGs, jointplot, CLL cw154_Tris_protease")
sns.jointplot(
    data=pairs2,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_Tris_protease")









    



PDR_difference vs. avgReadCpGs, jointplot, CLL cw154_Tris_protease






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[26]:





<matplotlib.text.Text at 0x1091a17f0>



In [27]:

    
# Joint plot with regression fit: pair-mean total unique CpGs against the pairwise PDR difference (pairs2).
print("PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs2, kind="reg")
# The title now names the variable actually plotted on x (it previously said avgReadCpGs_mean).
plt.title("PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease")









    



PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[27]:





<matplotlib.text.Text at 0x1093e6438>



In [28]:

    
from itertools import combinations

# Pairwise PDR regression for the 'cw154_Tris_protease_GR' protocol.
# One observation per unordered pair of files: the response is the absolute difference
# of the two PDR_total values, the covariates are the means of the two files' values.
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease_GR']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # columns not used below
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")

cc = list(combinations(cw154.filename, 2)) # all unordered pairs of filenames
# list(c) selects the two rows of the pair explicitly. numeric_only=True averages only the
# numeric columns: the string-valued "type" column is still present here and current pandas
# raises on it instead of silently dropping it.
out = pd.DataFrame([cw154A.loc[list(c), :].mean(numeric_only=True) for c in cc], index=cc)

# n x n matrix of absolute PDR differences. The ufunc is applied to plain NumPy arrays
# because `ufunc.outer` is not supported on pandas Series in newer pandas releases.
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.PDR_total.values, cw154.PDR_total.values)),
                     cw154.filename, cw154.filename)
stacked = df_ex.stack()  # one row per ordered (row file, column file) combination
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

out['filename'] = out.index  # the (file_a, file_b) tuple is the join key
out = out.reset_index(drop=True)
# 'filename' is the only column the two frames share, so it is named explicitly as the key.
pairs2a = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs2a.shape)
pairs2a = pairs2a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs2a.PDR_difference # dependent variable to predict

# Keep only the covariates used in the model.
X = pairs2a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
# NOTE(review): every file takes part in n-1 pairs, so the rows are not independent
# observations; read the OLS standard errors and p-values with that in mind.
est = sm.OLS(y, X).fit()
# The backslash is escaped so the text prints as before without an invalid-escape warning.
print("Regression results for CLL 'cw154_Tris_protease_GR', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_Tris_protease_GR', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[28]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.060


  Model:                    OLS          Adj. R-squared:        0.045


  Method:              Least Squares     F-statistic:           3.934


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):   0.00942


  Time:                  00:35:58        Log-Likelihood:       702.58


  No. Observations:          190         AIC:                  -1397.


  Df Residuals:              186         BIC:                  -1384.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   -8.6575      2.946     -2.939   0.004    -14.469    -2.846


  total_cpg_no_filter  -5.206e-10   3.64e-09     -0.143   0.886   -7.7e-09  6.66e-09


  bsRate_mean              9.0310      3.091      2.922   0.004      2.933    15.129


  avgReadCpG_mean         -0.0016      0.006     -0.270   0.788     -0.014     0.010




  Omnibus:         9.844    Durbin-Watson:         1.839


  Prob(Omnibus):   0.007    Jarque-Bera (JB):      8.185


  Skew:            0.420    Prob(JB):             0.0167


  Kurtosis:        2.427    Cond. No.           4.47e+09



In [29]:

    
# Joint plot with regression fit: pair-mean BS rate against the pairwise PDR difference (pairs2a).
print("PDR_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease_GR")
sns.jointplot(
    data=pairs2a,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. bsRate, jointplot, cw154  CLL cw154_Tris_protease_GR")









    



PDR_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease_GR






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[29]:





<matplotlib.text.Text at 0x1096935c0>



In [30]:

    
# Joint plot with regression fit: pair-mean avgReadCpG against the pairwise PDR difference (pairs2a).
print("PDR_difference vs. avgReadCpgs, jointplot, CLL cw154_Tris_protease_GR")
sns.jointplot(
    data=pairs2a,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_Tris_protease_GR")









    



PDR_difference vs. avgReadCpgs, jointplot, CLL cw154_Tris_protease_GR






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[30]:





<matplotlib.text.Text at 0x10986be80>



In [31]:

    
# Joint plot with regression fit: pair-mean total unique CpGs against the pairwise PDR difference (pairs2a).
print("PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease_GR")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs2a, kind="reg")
# The title now names the variable actually plotted on x (it previously said avgReadCpGs_mean).
plt.title("PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease_GR")









    



PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease_GR






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[31]:





<matplotlib.text.Text at 0x109aa63c8>



In [ ]:



In [ ]:



In [32]:

    
from itertools import combinations

# Pairwise PDR regression for the 'cw154_CutSmart_proteinase_K' protocol.
# One observation per unordered pair of files: the response is the absolute difference
# of the two PDR_total values, the covariates are the means of the two files' values.
cw154 = merged[merged["protocol"] == 'cw154_CutSmart_proteinase_K']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # columns not used below
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")

cc = list(combinations(cw154.filename, 2)) # all unordered pairs of filenames
# list(c) selects the two rows of the pair explicitly. numeric_only=True averages only the
# numeric columns: the string-valued "type" column is still present here and current pandas
# raises on it instead of silently dropping it.
out = pd.DataFrame([cw154A.loc[list(c), :].mean(numeric_only=True) for c in cc], index=cc)

# n x n matrix of absolute PDR differences. The ufunc is applied to plain NumPy arrays
# because `ufunc.outer` is not supported on pandas Series in newer pandas releases.
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.PDR_total.values, cw154.PDR_total.values)),
                     cw154.filename, cw154.filename)
stacked = df_ex.stack()  # one row per ordered (row file, column file) combination
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

out['filename'] = out.index  # the (file_a, file_b) tuple is the join key
out = out.reset_index(drop=True)
# 'filename' is the only column the two frames share, so it is named explicitly as the key.
pairs2b = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs2b.shape)
pairs2b = pairs2b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs2b.PDR_difference # dependent variable to predict

# Keep only the covariates used in the model.
X = pairs2b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
# NOTE(review): every file takes part in n-1 pairs, so the rows are not independent
# observations; read the OLS standard errors and p-values with that in mind.
est = sm.OLS(y, X).fit()
# The backslash is escaped so the text prints as before without an invalid-escape warning.
print("Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[32]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.089


  Model:                    OLS          Adj. R-squared:        0.074


  Method:              Least Squares     F-statistic:           6.049


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):  0.000596


  Time:                  00:36:01        Log-Likelihood:       651.12


  No. Observations:          190         AIC:                  -1294.


  Df Residuals:              186         BIC:                  -1281.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   -3.8803      4.876     -0.796   0.427    -13.500     5.739


  total_cpg_no_filter   -2.13e-08   6.06e-09     -3.515   0.001  -3.33e-08 -9.34e-09


  bsRate_mean              3.8863      5.055      0.769   0.443     -6.087    13.859


  avgReadCpG_mean          0.0224      0.016      1.362   0.175     -0.010     0.055




  Omnibus:        10.262    Durbin-Watson:         1.833


  Prob(Omnibus):   0.006    Jarque-Bera (JB):     11.040


  Skew:            0.582    Prob(JB):            0.00401


  Kurtosis:        2.805    Cond. No.           6.96e+09



In [33]:

    
# Joint plot with regression fit: pair-mean BS rate against the pairwise PDR difference (pairs2b).
print("PDR_difference vs. bsRate, jointplot,  CLL cw154_CutSmart_proteinase_K")
sns.jointplot(
    data=pairs2b,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. bsRate, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")









    



PDR_difference vs. bsRate, jointplot,  CLL cw154_CutSmart_proteinase_K






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[33]:





<matplotlib.text.Text at 0x109e0cf98>



In [34]:

    
# Joint plot with regression fit: pair-mean avgReadCpG against the pairwise PDR difference (pairs2b).
print("PDR_difference vs. avgReadCpG_mean, jointplot, CLL cw154_CutSmart_proteinase_K")
sns.jointplot(
    data=pairs2b,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_CutSmart_proteinase_K")









    



PDR_difference vs. avgReadCpG_mean, jointplot, CLL cw154_CutSmart_proteinase_K






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[34]:





<matplotlib.text.Text at 0x10a01ae80>



In [35]:

    
# Joint plot with regression fit: pair-mean total unique CpGs against the pairwise PDR difference (pairs2b).
print("PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs2b, kind="reg")
# The title now names the variable actually plotted on x (it previously said avgReadCpGs_mean).
plt.title("PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")









    



PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_CutSmart_proteinase_K






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[35]:





<matplotlib.text.Text at 0x10a24d748>



In [ ]:



In [36]:

    
from itertools import combinations

# Pairwise PDR regression for the 'NormalBCD19pCD27pcell1_22_' protocol.
# One observation per unordered pair of files: the response is the absolute difference
# of the two PDR_total values, the covariates are the means of the two files' values.
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell1_22_']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # columns not used below
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")

cc = list(combinations(pcell.filename, 2)) # all unordered pairs of filenames
# list(c) selects the two rows of the pair explicitly. numeric_only=True averages only the
# numeric columns: the string-valued "type" column is still present here and current pandas
# raises on it instead of silently dropping it.
out = pd.DataFrame([pcellA.loc[list(c), :].mean(numeric_only=True) for c in cc], index=cc)

# n x n matrix of absolute PDR differences. The ufunc is applied to plain NumPy arrays
# because `ufunc.outer` is not supported on pandas Series in newer pandas releases.
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR_total.values, pcell.PDR_total.values)),
                     pcell.filename, pcell.filename)
stacked = df_ex.stack()  # one row per ordered (row file, column file) combination
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

out['filename'] = out.index  # the (file_a, file_b) tuple is the join key
out = out.reset_index(drop=True)
# 'filename' is the only column the two frames share, so it is named explicitly as the key.
pairs3 = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs3.shape)
pairs3 = pairs3.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs3.PDR_difference # dependent variable to predict

# Keep only the covariates used in the model.
X = pairs3.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
# NOTE(review): every file takes part in n-1 pairs, so the rows are not independent
# observations; read the OLS standard errors and p-values with that in mind.
est = sm.OLS(y, X).fit()
# The backslash is escaped so the text prints as before without an invalid-escape warning.
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



17
(136, 14)
(136,)
(136, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[36]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.030


  Model:                    OLS          Adj. R-squared:        0.008


  Method:              Least Squares     F-statistic:           1.357


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):    0.259 


  Time:                  00:36:04        Log-Likelihood:       247.33


  No. Observations:          136         AIC:                  -486.7


  Df Residuals:              132         BIC:                  -475.0


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -30.6966     57.991     -0.529   0.597   -145.409    84.015


  total_cpg_no_filter  -1.001e-07   7.26e-08     -1.378   0.170  -2.44e-07  4.35e-08


  bsRate_mean             31.8723     58.137      0.548   0.584    -83.129   146.874


  avgReadCpG_mean         -0.1449      0.075     -1.923   0.057     -0.294     0.004




  Omnibus:         9.591    Durbin-Watson:         1.827


  Prob(Omnibus):   0.008    Jarque-Bera (JB):     10.368


  Skew:            0.671    Prob(JB):            0.00561


  Kurtosis:        2.830    Cond. No.           7.56e+09



In [37]:

    
# Scatter + linear fit (kind="reg") of PDR_difference against bsRate_mean for pairs3.
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(
    data=pairs3,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[37]:





<matplotlib.text.Text at 0x10a4a8cc0>



In [38]:

    
# Scatter + linear fit (kind="reg") of PDR_difference against avgReadCpG_mean for pairs3.
print("PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(
    data=pairs3,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell1_22_")









    



PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[38]:





<matplotlib.text.Text at 0x10a7142e8>



In [39]:

    
# Scatter + linear fit (kind="reg") of PDR_difference against total_cpg_no_filter for pairs3.
print("PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(
    data=pairs3,
    x="total_cpg_no_filter",
    y="PDR_difference",
    kind="reg",
)
# Title now names the covariate actually plotted (it previously said avgReadCpGs_mean).
plt.title("PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")









    



PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[39]:





<matplotlib.text.Text at 0x10aa1a588>



In [ ]:



In [40]:

    
# Pairwise regression for protocol 'NormalBCD19pCD27pcell23_44':
# regress the absolute PDR_total difference of every pair of cells on the
# pair-mean covariates (total_cpg_no_filter, bsRate, avgReadCpG).
from itertools import combinations

pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell23_44']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1)  # not used below
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")

# One row per unordered pair of files, holding the column-wise mean of the two cells.
# numeric_only=True makes the "skip non-numeric columns" behaviour explicit:
# pandas >= 2.0 raises TypeError instead of silently dropping such columns
# (presumably the string-valued `type` column here -- confirm against `merged`).
cc = list(combinations(pcell.filename, 2))
out = pd.DataFrame([pcellA.loc[c, :].mean(numeric_only=True) for c in cc], index=cc)

# |PDR_total_i - PDR_total_j| for every ordered pair. The ufunc gets plain
# ndarrays because ufunc.outer on a Series raises NotImplementedError in pandas >= 1.0.
pdr_values = pcell.PDR_total.values
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pdr_values, pdr_values)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Join pair means and pair differences on the (file_a, file_b) tuple.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3a = pd.merge(out, PDR_differences, how='inner')
print(pairs3a.shape)
pairs3a = pairs3a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs3a.PDR_difference # dependent variable to predict

# Keep only the covariates of interest as regressors.
X = pairs3a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# Label corrected: this cell fits the pcell23_44 subset (it was copy-pasted as pcell1_22_).
# "\\delta" prints a literal backslash without relying on the invalid "\d" escape.
print("Regression results for Normal 'NormalBCD19pCD27pcell23_44', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



22
(231, 14)
(231,)
(231, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[40]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.081


  Model:                    OLS          Adj. R-squared:        0.069


  Method:              Least Squares     F-statistic:           6.678


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):  0.000244


  Time:                  00:36:07        Log-Likelihood:       520.12


  No. Observations:          231         AIC:                  -1032.


  Df Residuals:              227         BIC:                  -1018.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -67.7274     42.852     -1.580   0.115   -152.167    16.712


  total_cpg_no_filter   7.637e-08   3.29e-08      2.322   0.021   1.16e-08  1.41e-07


  bsRate_mean             67.2942     42.926      1.568   0.118    -17.290   151.879


  avgReadCpG_mean          0.0865      0.020      4.340   0.000      0.047     0.126




  Omnibus:        39.953    Durbin-Watson:         1.261


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     55.176


  Skew:            1.099    Prob(JB):           1.04e-12


  Kurtosis:        3.947    Cond. No.           1.03e+10



In [41]:

    
# Scatter + linear fit (kind="reg") of PDR_difference against bsRate_mean for pairs3a.
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(
    data=pairs3a,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[41]:





<matplotlib.text.Text at 0x10ad586a0>



In [42]:

    
# Scatter + linear fit (kind="reg") of PDR_difference against avgReadCpG_mean for pairs3a.
print("PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(
    data=pairs3a,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell23_44")









    



PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[42]:





<matplotlib.text.Text at 0x10afae4a8>



In [43]:

    
# Scatter + linear fit (kind="reg") of PDR_difference against total_cpg_no_filter for pairs3a.
print("PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(
    data=pairs3a,
    x="total_cpg_no_filter",
    y="PDR_difference",
    kind="reg",
)
# Title now names the covariate actually plotted (it previously said avgReadCpGs_mean).
plt.title("PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")









    



PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[43]:





<matplotlib.text.Text at 0x10b1dd4e0>



In [ ]:



In [44]:

    
# Pairwise regression for protocol 'NormalBCD19pCD27pcell45_66':
# regress the absolute PDR_total difference of every pair of cells on the
# pair-mean covariates (total_cpg_no_filter, bsRate, avgReadCpG).
from itertools import combinations

pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell45_66']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1)  # not used below
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")

# One row per unordered pair of files, holding the column-wise mean of the two cells.
# numeric_only=True makes the "skip non-numeric columns" behaviour explicit:
# pandas >= 2.0 raises TypeError instead of silently dropping such columns
# (presumably the string-valued `type` column here -- confirm against `merged`).
cc = list(combinations(pcell.filename, 2))
out = pd.DataFrame([pcellA.loc[c, :].mean(numeric_only=True) for c in cc], index=cc)

# |PDR_total_i - PDR_total_j| for every ordered pair. The ufunc gets plain
# ndarrays because ufunc.outer on a Series raises NotImplementedError in pandas >= 1.0.
pdr_values = pcell.PDR_total.values
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pdr_values, pdr_values)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Join pair means and pair differences on the (file_a, file_b) tuple.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3b = pd.merge(out, PDR_differences, how='inner')
print(pairs3b.shape)
pairs3b = pairs3b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs3b.PDR_difference # dependent variable to predict

# Keep only the covariates of interest as regressors.
X = pairs3b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# Label corrected: this cell fits the pcell45_66 subset (it was copy-pasted as pcell1_22_).
# "\\delta" prints a literal backslash without relying on the invalid "\d" escape.
print("Regression results for Normal 'NormalBCD19pCD27pcell45_66', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



9
(36, 14)
(36,)
(36, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[44]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.169


  Model:                    OLS          Adj. R-squared:        0.091


  Method:              Least Squares     F-statistic:           2.174


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):    0.110 


  Time:                  00:36:10        Log-Likelihood:       77.381


  No. Observations:           36         AIC:                  -146.8


  Df Residuals:               32         BIC:                  -140.4


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -204.2335    245.012     -0.834   0.411   -703.306   294.839


  total_cpg_no_filter  -1.981e-08   3.49e-08     -0.567   0.575   -9.1e-08  5.14e-08


  bsRate_mean            203.9084    245.166      0.832   0.412   -295.478   703.295


  avgReadCpG_mean          0.1252      0.079      1.594   0.121     -0.035     0.285




  Omnibus:         1.836    Durbin-Watson:         1.647


  Prob(Omnibus):   0.399    Jarque-Bera (JB):      1.718


  Skew:            0.484    Prob(JB):              0.424


  Kurtosis:        2.541    Cond. No.           3.29e+10



In [45]:

    
# Scatter + linear fit (kind="reg") of PDR_difference against bsRate_mean for pairs3b.
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(
    data=pairs3b,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[45]:





<matplotlib.text.Text at 0x10b50c940>



In [46]:

    
# Scatter + linear fit (kind="reg") of PDR_difference against avgReadCpG_mean for pairs3b.
print("PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(
    data=pairs3b,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell45_66")









    



PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[46]:





<matplotlib.text.Text at 0x10b736cf8>



In [47]:

    
# Scatter + linear fit (kind="reg") of PDR_difference against total_cpg_no_filter for pairs3b.
print("PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(
    data=pairs3b,
    x="total_cpg_no_filter",
    y="PDR_difference",
    kind="reg",
)
# Title now names the covariate actually plotted (it previously said avgReadCpGs_mean).
plt.title("PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")









    



PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[47]:





<matplotlib.text.Text at 0x10b912080>



In [ ]:



In [ ]:



In [48]:

    
# Pairwise regression for protocol 'NormalBCD19pCD27pcell67_88':
# regress the absolute PDR_total difference of every pair of cells on the
# pair-mean covariates (total_cpg_no_filter, bsRate, avgReadCpG).
from itertools import combinations

pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell67_88']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1)  # not used below
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")

# One row per unordered pair of files, holding the column-wise mean of the two cells.
# numeric_only=True makes the "skip non-numeric columns" behaviour explicit:
# pandas >= 2.0 raises TypeError instead of silently dropping such columns
# (presumably the string-valued `type` column here -- confirm against `merged`).
cc = list(combinations(pcell.filename, 2))
out = pd.DataFrame([pcellA.loc[c, :].mean(numeric_only=True) for c in cc], index=cc)

# |PDR_total_i - PDR_total_j| for every ordered pair. The ufunc gets plain
# ndarrays because ufunc.outer on a Series raises NotImplementedError in pandas >= 1.0.
pdr_values = pcell.PDR_total.values
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pdr_values, pdr_values)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Join pair means and pair differences on the (file_a, file_b) tuple.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3c = pd.merge(out, PDR_differences, how='inner')
print(pairs3c.shape)
pairs3c = pairs3c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs3c.PDR_difference # dependent variable to predict

# Keep only the covariates of interest as regressors.
X = pairs3c.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# Label corrected: this cell fits the pcell67_88 subset (it was copy-pasted as pcell1_22_).
# "\\delta" prints a literal backslash without relying on the invalid "\d" escape.
print("Regression results for Normal 'NormalBCD19pCD27pcell67_88', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[48]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.006


  Model:                    OLS          Adj. R-squared:       -0.010


  Method:              Least Squares     F-statistic:          0.4045


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):    0.750 


  Time:                  00:36:14        Log-Likelihood:       397.02


  No. Observations:          190         AIC:                  -786.0


  Df Residuals:              186         BIC:                  -773.1


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -13.3122     46.675     -0.285   0.776   -105.392    78.768


  total_cpg_no_filter   3.897e-08   3.84e-08      1.015   0.311  -3.68e-08  1.15e-07


  bsRate_mean             13.2619     46.664      0.284   0.777    -78.796   105.320


  avgReadCpG_mean          0.0174      0.050      0.348   0.728     -0.081     0.116




  Omnibus:        12.462    Durbin-Watson:         1.572


  Prob(Omnibus):   0.002    Jarque-Bera (JB):     13.739


  Skew:            0.650    Prob(JB):            0.00104


  Kurtosis:        2.788    Cond. No.           8.48e+09



In [49]:

    
# Scatter + linear fit (kind="reg") of PDR_difference against bsRate_mean for pairs3c.
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(
    data=pairs3c,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[49]:





<matplotlib.text.Text at 0x10bc3ea90>



In [50]:

    
# Scatter + linear fit (kind="reg") of PDR_difference against avgReadCpG_mean for pairs3c.
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(
    data=pairs3c,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell67_88")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27pcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[50]:





<matplotlib.text.Text at 0x10be7cc88>



In [51]:

    
# Scatter + linear fit (kind="reg") of PDR_difference against total_cpg_no_filter for pairs3c.
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(
    data=pairs3c,
    x="total_cpg_no_filter",
    y="PDR_difference",
    kind="reg",
)
# Title now names the covariate actually plotted (it previously said avgReadCpGs_mean).
plt.title("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27pcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[51]:





<matplotlib.text.Text at 0x10c1aeb38>



In [ ]:



In [ ]:



In [52]:

    
# Pairwise regression for protocol 'NormalBCD19pCD27mcell1_22_':
# regress the absolute PDR_total difference of every pair of cells on the
# pair-mean covariates (total_cpg_no_filter, bsRate, avgReadCpG).
from itertools import combinations

mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell1_22_']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1)  # not used below
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")

# One row per unordered pair of files, holding the column-wise mean of the two cells.
# numeric_only=True makes the "skip non-numeric columns" behaviour explicit:
# pandas >= 2.0 raises TypeError instead of silently dropping such columns
# (presumably the string-valued `type` column here -- confirm against `merged`).
cc = list(combinations(mcell.filename, 2))
out = pd.DataFrame([mcellA.loc[c, :].mean(numeric_only=True) for c in cc], index=cc)

# |PDR_total_i - PDR_total_j| for every ordered pair. The ufunc gets plain
# ndarrays because ufunc.outer on a Series raises NotImplementedError in pandas >= 1.0.
pdr_values = mcell.PDR_total.values
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pdr_values, pdr_values)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Join pair means and pair differences on the (file_a, file_b) tuple.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4 = pd.merge(out, PDR_differences, how='inner')
print(pairs4.shape)
pairs4 = pairs4.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs4.PDR_difference # dependent variable to predict

# Keep only the covariates of interest as regressors.
X = pairs4.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# "\\delta" prints the same text as before without relying on the invalid "\d" escape.
print("Regression results for Normal 'NormalBCD19pCD27mcell1_22_', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



18
(153, 14)
(153,)
(153, 3)
Regression results for Normal 'NormalBCD19pCD27mcell1_22_', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[52]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.249


  Model:                    OLS          Adj. R-squared:        0.234


  Method:              Least Squares     F-statistic:           16.45


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):  2.77e-09


  Time:                  00:36:17        Log-Likelihood:       435.37


  No. Observations:          153         AIC:                  -862.7


  Df Residuals:              149         BIC:                  -850.6


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  233.6229     38.347      6.092   0.000    157.850   309.396


  total_cpg_no_filter  -6.274e-08   2.11e-08     -2.969   0.003  -1.04e-07  -2.1e-08


  bsRate_mean           -234.6229     38.536     -6.088   0.000   -310.771  -158.475


  avgReadCpG_mean          0.0566      0.022      2.548   0.012      0.013     0.101




  Omnibus:        50.820    Durbin-Watson:         1.593


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     94.868


  Skew:            1.586    Prob(JB):           2.51e-21


  Kurtosis:        5.197    Cond. No.           1.52e+10



In [53]:

    
# Scatter + linear fit (kind="reg") of PDR_difference against bsRate_mean for pairs4.
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(
    data=pairs4,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[53]:





<matplotlib.text.Text at 0x10c3fde10>



In [54]:

    
# Scatter + linear fit (kind="reg") of PDR_difference against avgReadCpG_mean for pairs4.
print("PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(
    data=pairs4,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell1_22_")









    



PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[54]:





<matplotlib.text.Text at 0x10c69a208>



In [55]:

    
# Scatter + linear fit (kind="reg") of PDR_difference against total_cpg_no_filter for pairs4.
print("PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(
    data=pairs4,
    x="total_cpg_no_filter",
    y="PDR_difference",
    kind="reg",
)
# Title now names the covariate actually plotted (it previously said avgReadCpGs_mean).
plt.title("PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")









    



PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[55]:





<matplotlib.text.Text at 0x10bc30630>



In [ ]:



In [56]:

    
# Pairwise regression for protocol 'NormalBCD19pCD27mcell23_44':
# regress the absolute PDR_total difference of every pair of cells on the
# pair-mean covariates (total_cpg_no_filter, bsRate, avgReadCpG).
from itertools import combinations

mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell23_44']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1)  # not used below
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")

# One row per unordered pair of files, holding the column-wise mean of the two cells.
# numeric_only=True makes the "skip non-numeric columns" behaviour explicit:
# pandas >= 2.0 raises TypeError instead of silently dropping such columns
# (presumably the string-valued `type` column here -- confirm against `merged`).
cc = list(combinations(mcell.filename, 2))
out = pd.DataFrame([mcellA.loc[c, :].mean(numeric_only=True) for c in cc], index=cc)

# |PDR_total_i - PDR_total_j| for every ordered pair. The ufunc gets plain
# ndarrays because ufunc.outer on a Series raises NotImplementedError in pandas >= 1.0.
pdr_values = mcell.PDR_total.values
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pdr_values, pdr_values)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Join pair means and pair differences on the (file_a, file_b) tuple.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4a = pd.merge(out, PDR_differences, how='inner')
print(pairs4a.shape)
pairs4a = pairs4a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs4a.PDR_difference # dependent variable to predict

# Keep only the covariates of interest as regressors.
X = pairs4a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# "\\delta" prints the same text as before without relying on the invalid "\d" escape.
print("Regression results for Normal 'NormalBCD19pCD27mcell23_44', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



18
(153, 14)
(153,)
(153, 3)
Regression results for Normal 'NormalBCD19pCD27mcell23_44', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[56]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.107


  Model:                    OLS          Adj. R-squared:        0.089


  Method:              Least Squares     F-statistic:           5.958


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):  0.000729


  Time:                  00:36:21        Log-Likelihood:       383.38


  No. Observations:          153         AIC:                  -758.8


  Df Residuals:              149         BIC:                  -746.6


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -16.9739     60.736     -0.279   0.780   -136.989   103.041


  total_cpg_no_filter   1.259e-07   4.07e-08      3.098   0.002   4.56e-08  2.06e-07


  bsRate_mean             17.0169     60.905      0.279   0.780   -103.333   137.366


  avgReadCpG_mean         -0.0038      0.041     -0.093   0.926     -0.085     0.078




  Omnibus:        71.167    Durbin-Watson:         0.230


  Prob(Omnibus):   0.000    Jarque-Bera (JB):    175.960


  Skew:            2.063    Prob(JB):           6.18e-39


  Kurtosis:        6.253    Cond. No.           1.55e+10



In [57]:

    
# Scatter + linear fit (kind="reg") of PDR_difference against bsRate_mean for pairs4a.
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(
    data=pairs4a,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[57]:





<matplotlib.text.Text at 0x10cc2e0f0>



In [58]:

    
# Scatter + linear fit (kind="reg") of PDR_difference against avgReadCpG_mean for pairs4a.
print("PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(
    data=pairs4a,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell23_44")









    



PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[58]:





<matplotlib.text.Text at 0x10ced0ef0>



In [59]:

    
# Scatter + linear fit (kind="reg") of PDR_difference against total_cpg_no_filter for pairs4a.
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(
    data=pairs4a,
    x="total_cpg_no_filter",
    y="PDR_difference",
    kind="reg",
)
# Title now names the covariate actually plotted (it previously said avgReadCpGs_mean).
plt.title("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[59]:





<matplotlib.text.Text at 0x10d165080>



In [ ]:



In [60]:

    
# Pairwise regression for protocol 'NormalBCD19pCD27mcell45_66':
# regress the absolute PDR_total difference of every pair of cells on the
# pair-mean covariates (total_cpg_no_filter, bsRate, avgReadCpG).
from itertools import combinations

mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell45_66']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1)  # not used below
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")

# One row per unordered pair of files, holding the column-wise mean of the two cells.
# numeric_only=True makes the "skip non-numeric columns" behaviour explicit:
# pandas >= 2.0 raises TypeError instead of silently dropping such columns
# (presumably the string-valued `type` column here -- confirm against `merged`).
cc = list(combinations(mcell.filename, 2))
out = pd.DataFrame([mcellA.loc[c, :].mean(numeric_only=True) for c in cc], index=cc)

# |PDR_total_i - PDR_total_j| for every ordered pair. The ufunc gets plain
# ndarrays because ufunc.outer on a Series raises NotImplementedError in pandas >= 1.0.
pdr_values = mcell.PDR_total.values
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pdr_values, pdr_values)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Join pair means and pair differences on the (file_a, file_b) tuple.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4b = pd.merge(out, PDR_differences, how='inner')
print(pairs4b.shape)
pairs4b = pairs4b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs4b.PDR_difference # dependent variable to predict

# Keep only the covariates of interest as regressors.
X = pairs4b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# "\\delta" prints the same text as before without relying on the invalid "\d" escape.
print("Regression results for Normal 'NormalBCD19pCD27mcell45_66', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



17
(136, 14)
(136,)
(136, 3)
Regression results for Normal 'NormalBCD19pCD27mcell45_66', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[60]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.192


  Model:                    OLS          Adj. R-squared:        0.173


  Method:              Least Squares     F-statistic:           10.43


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):  3.30e-06


  Time:                  00:36:24        Log-Likelihood:       586.97


  No. Observations:          136         AIC:                  -1166.


  Df Residuals:              132         BIC:                  -1154.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -39.1096      8.655     -4.519   0.000    -56.230   -21.989


  total_cpg_no_filter  -4.442e-10   7.49e-09     -0.059   0.953  -1.53e-08  1.44e-08


  bsRate_mean             39.1438      8.669      4.516   0.000     21.997    56.291


  avgReadCpG_mean          0.0105      0.006      1.870   0.064     -0.001     0.022




  Omnibus:         3.164    Durbin-Watson:         1.788


  Prob(Omnibus):   0.206    Jarque-Bera (JB):      3.203


  Skew:            0.356    Prob(JB):              0.202


  Kurtosis:        2.759    Cond. No.           1.28e+10



In [61]:

    
# Joint regression plot of PDR_difference (y) against the pair-averaged bisulfite
# rate bsRate_mean (x) for the cell pairs in pairs4b; the caption is also echoed
# to the cell's text output.
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(
    data=pairs4b,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[61]:





<matplotlib.text.Text at 0x10d525358>



In [62]:

    
# Joint regression plot of PDR_difference (y) against the pair-averaged
# avgReadCpG_mean (x) for the cell pairs in pairs4b; the caption is also echoed
# to the cell's text output.
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(
    data=pairs4b,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell45_66")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[62]:





<matplotlib.text.Text at 0x10d74dc50>



In [63]:

    
# Joint regression plot of PDR_difference (y) against the pair-averaged
# total_cpg_no_filter (x) for the cell pairs in pairs4b; the caption is also
# echoed to the cell's text output.
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs4b, kind="reg")
# The title names the variable actually on the x axis (total_cpg_no_filter),
# matching the printed caption.
# NOTE(review): plt.title acts on the current axes, which may be one panel of the
# joint grid rather than the whole figure -- confirm the placement when rendered.
plt.title("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[63]:





<matplotlib.text.Text at 0x10da66ac8>



In [ ]:



In [64]:

    
# Pairwise covariate regression for protocol 'NormalBCD19pCD27mcell67_88': for every
# unordered pair of cells, regress the absolute difference in PDR_total on the
# pair-averaged covariates.
from itertools import combinations

mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell67_88']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")

# One row per unordered pair of filenames, holding the column-wise mean of the two cells.
# numeric_only=True states explicitly that non-numeric columns are left out of the
# mean, instead of depending on pandas dropping them silently.
cc = list(combinations(mcell.filename, 2))
out = pd.DataFrame([mcellA.loc[list(c), :].mean(numeric_only=True) for c in cc], index=cc)

# |PDR_total_i - PDR_total_j| for every ordered pair, labelled by both filenames.
# The outer difference is taken on the underlying arrays (.values) so it does not
# depend on pandas Series supporting ufunc.outer.
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR_total.values, mcell.PDR_total.values)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Attach each pair's difference to its averaged covariates; the join key is the
# (filename, filename) pair stored in the 'filename' column of both frames.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4c = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs4c.shape)
pairs4c = pairs4c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs4c.PDR_difference # dependent variable to predict

# Drop the response, the pair key and the unused statistics; the columns that
# remain are the covariates of the regression.
X = pairs4c.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# The backslash is escaped so that "\delta" is printed literally without an
# invalid "\d" escape sequence in the source.
print("Regression results for Normal 'NormalBCD19pCD27mcell67_88', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



21
(210, 14)
(210,)
(210, 3)
Regression results for Normal 'NormalBCD19pCD27mcell67_88', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[64]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.249


  Model:                    OLS          Adj. R-squared:        0.238


  Method:              Least Squares     F-statistic:           22.80


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):  8.70e-13


  Time:                  00:36:29        Log-Likelihood:       589.43


  No. Observations:          210         AIC:                  -1171.


  Df Residuals:              206         BIC:                  -1157.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -370.7854     65.232     -5.684   0.000   -499.393  -242.178


  total_cpg_no_filter   1.944e-07   2.42e-08      8.049   0.000   1.47e-07  2.42e-07


  bsRate_mean            370.8972     65.333      5.677   0.000    242.090   499.705


  avgReadCpG_mean          0.1069      0.019      5.488   0.000      0.068     0.145




  Omnibus:        63.937    Durbin-Watson:         0.324


  Prob(Omnibus):   0.000    Jarque-Bera (JB):    124.179


  Skew:            1.519    Prob(JB):           1.08e-27


  Kurtosis:        5.228    Cond. No.           3.13e+10



In [65]:

    
# Joint regression plot of PDR_difference (y) against the pair-averaged bisulfite
# rate bsRate_mean (x) for the cell pairs in pairs4c; the caption is also echoed
# to the cell's text output.
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(
    data=pairs4c,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[65]:





<matplotlib.text.Text at 0x10dd74588>



In [66]:

    
# Joint regression plot of PDR_difference (y) against the pair-averaged
# avgReadCpG_mean (x) for the cell pairs in pairs4c; the caption is also echoed
# to the cell's text output.
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(
    data=pairs4c,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[66]:





<matplotlib.text.Text at 0x10e0aaeb8>



In [67]:

    
# Joint regression plot of PDR_difference (y) against the pair-averaged
# total_cpg_no_filter (x) for the cell pairs in pairs4c; the caption is also
# echoed to the cell's text output.
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs4c, kind="reg")
# The title names the variable actually on the x axis (total_cpg_no_filter),
# matching the printed caption.
# NOTE(review): plt.title acts on the current axes, which may be one panel of the
# joint grid rather than the whole figure -- confirm the placement when rendered.
plt.title("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[67]:





<matplotlib.text.Text at 0x10e33afd0>



In [ ]:



In [68]:

    
# Pairwise covariate regression for protocol 'RRBS_NormalBCD19pcell1_22_': for every
# unordered pair of cells, regress the absolute difference in PDR_total on the
# pair-averaged covariates.
from itertools import combinations

CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell1_22_']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")

# One row per unordered pair of filenames, holding the column-wise mean of the two cells.
# numeric_only=True states explicitly that non-numeric columns are left out of the
# mean, instead of depending on pandas dropping them silently.
cc = list(combinations(CD19cell.filename, 2))
out = pd.DataFrame([CD19cellA.loc[list(c), :].mean(numeric_only=True) for c in cc], index=cc)

# |PDR_total_i - PDR_total_j| for every ordered pair, labelled by both filenames.
# The outer difference is taken on the underlying arrays (.values) so it does not
# depend on pandas Series supporting ufunc.outer.
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.PDR_total.values, CD19cell.PDR_total.values)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Attach each pair's difference to its averaged covariates; the join key is the
# (filename, filename) pair stored in the 'filename' column of both frames.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5 = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs5.shape)
pairs5 = pairs5.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs5.PDR_difference # dependent variable to predict

# Drop the response, the pair key and the unused statistics; the columns that
# remain are the covariates of the regression.
X = pairs5.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# The backslash is escaped so that "\delta" is printed literally without an
# invalid "\d" escape sequence in the source.
print("Regression results for Normal 'RRBS_NormalBCD19pcell1_22_', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



21
(210, 14)
(210,)
(210, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell1_22_', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[68]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.088


  Model:                    OLS          Adj. R-squared:        0.075


  Method:              Least Squares     F-statistic:           6.647


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):  0.000263


  Time:                  00:36:33        Log-Likelihood:       335.17


  No. Observations:          210         AIC:                  -662.3


  Df Residuals:              206         BIC:                  -649.0


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -288.8814     81.380     -3.550   0.000   -449.325  -128.438


  total_cpg_no_filter  -2.483e-08   2.03e-08     -1.222   0.223  -6.49e-08  1.52e-08


  bsRate_mean            290.2301     81.596      3.557   0.000    129.360   451.101


  avgReadCpG_mean         -0.0882      0.055     -1.599   0.111     -0.197     0.021




  Omnibus:        100.127    Durbin-Watson:         1.953


  Prob(Omnibus):   0.000     Jarque-Bera (JB):     15.496


  Skew:            0.291     Prob(JB):           0.000432


  Kurtosis:        1.803     Cond. No.           1.48e+10



In [69]:

    
# Joint regression plot of PDR_difference (y) against the pair-averaged bisulfite
# rate bsRate_mean (x) for the cell pairs in pairs5; the caption is also echoed
# to the cell's text output.
print("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(
    data=pairs5,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")









    



PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[69]:





<matplotlib.text.Text at 0x10e6360f0>



In [70]:

    
# Joint regression plot of PDR_difference (y) against the pair-averaged
# avgReadCpG_mean (x) for the cell pairs in pairs5; the caption is also echoed
# to the cell's text output.
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(
    data=pairs5,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell1_22_")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[70]:





<matplotlib.text.Text at 0x10e87b5f8>



In [71]:

    
# Joint regression plot of PDR_difference (y) against the pair-averaged
# total_cpg_no_filter (x) for the cell pairs in pairs5; the caption is also
# echoed to the cell's text output.
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs5, kind="reg")
# The title names the variable actually on the x axis (total_cpg_no_filter),
# matching the printed caption.
# NOTE(review): plt.title acts on the current axes, which may be one panel of the
# joint grid rather than the whole figure -- confirm the placement when rendered.
plt.title("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[71]:





<matplotlib.text.Text at 0x10ec56e80>



In [ ]:



In [72]:

    
# Pairwise covariate regression for protocol 'RRBS_NormalBCD19pcell23_44': for every
# unordered pair of cells, regress the absolute difference in PDR_total on the
# pair-averaged covariates.
from itertools import combinations

CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell23_44']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")

# One row per unordered pair of filenames, holding the column-wise mean of the two cells.
# numeric_only=True states explicitly that non-numeric columns are left out of the
# mean, instead of depending on pandas dropping them silently.
cc = list(combinations(CD19cell.filename, 2))
out = pd.DataFrame([CD19cellA.loc[list(c), :].mean(numeric_only=True) for c in cc], index=cc)

# |PDR_total_i - PDR_total_j| for every ordered pair, labelled by both filenames.
# The outer difference is taken on the underlying arrays (.values) so it does not
# depend on pandas Series supporting ufunc.outer.
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.PDR_total.values, CD19cell.PDR_total.values)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Attach each pair's difference to its averaged covariates; the join key is the
# (filename, filename) pair stored in the 'filename' column of both frames.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5a = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs5a.shape)
pairs5a = pairs5a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs5a.PDR_difference # dependent variable to predict

# Drop the response, the pair key and the unused statistics; the columns that
# remain are the covariates of the regression.
X = pairs5a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# The backslash is escaped so that "\delta" is printed literally without an
# invalid "\d" escape sequence in the source.
print("Regression results for Normal 'RRBS_NormalBCD19pcell23_44', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell23_44', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[72]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.154


  Model:                    OLS          Adj. R-squared:        0.141


  Method:              Least Squares     F-statistic:           11.31


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):  7.52e-07


  Time:                  00:36:37        Log-Likelihood:       358.89


  No. Observations:          190         AIC:                  -709.8


  Df Residuals:              186         BIC:                  -696.8


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -216.8055     41.260     -5.255   0.000   -298.202  -135.409


  total_cpg_no_filter   9.099e-09   1.21e-08      0.754   0.452  -1.47e-08  3.29e-08


  bsRate_mean            218.0680     41.426      5.264   0.000    136.342   299.794


  avgReadCpG_mean         -0.1064      0.045     -2.384   0.018     -0.194    -0.018




  Omnibus:        16.391    Durbin-Watson:         1.074


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     18.711


  Skew:            0.769    Prob(JB):           8.65e-05


  Kurtosis:        3.024    Cond. No.           1.63e+10



In [73]:

    
# Joint regression plot of PDR_difference (y) against the pair-averaged bisulfite
# rate bsRate_mean (x) for the cell pairs in pairs5a; the caption is also echoed
# to the cell's text output.
print("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(
    data=pairs5a,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")









    



PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[73]:





<matplotlib.text.Text at 0x10eed6860>



In [74]:

    
# Joint regression plot of PDR_difference (y) against the pair-averaged
# avgReadCpG_mean (x) for the cell pairs in pairs5a; the caption is also echoed
# to the cell's text output.
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(
    data=pairs5a,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell23_44")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[74]:





<matplotlib.text.Text at 0x10f135e10>



In [75]:

    
# Joint regression plot of PDR_difference (y) against the pair-averaged
# total_cpg_no_filter (x) for the cell pairs in pairs5a; the caption is also
# echoed to the cell's text output.
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs5a, kind="reg")
# The title names the variable actually on the x axis (total_cpg_no_filter),
# matching the printed caption.
# NOTE(review): plt.title acts on the current axes, which may be one panel of the
# joint grid rather than the whole figure -- confirm the placement when rendered.
plt.title("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[75]:





<matplotlib.text.Text at 0x10e5f5a20>



In [ ]:



In [76]:

    
# Pairwise covariate regression for protocol 'RRBS_NormalBCD19pcell45_66': for every
# unordered pair of cells, regress the absolute difference in PDR_total on the
# pair-averaged covariates.
from itertools import combinations

CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell45_66']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")

# One row per unordered pair of filenames, holding the column-wise mean of the two cells.
# numeric_only=True states explicitly that non-numeric columns are left out of the
# mean, instead of depending on pandas dropping them silently.
cc = list(combinations(CD19cell.filename, 2))
out = pd.DataFrame([CD19cellA.loc[list(c), :].mean(numeric_only=True) for c in cc], index=cc)

# |PDR_total_i - PDR_total_j| for every ordered pair, labelled by both filenames.
# The outer difference is taken on the underlying arrays (.values) so it does not
# depend on pandas Series supporting ufunc.outer.
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.PDR_total.values, CD19cell.PDR_total.values)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Attach each pair's difference to its averaged covariates; the join key is the
# (filename, filename) pair stored in the 'filename' column of both frames.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5b = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs5b.shape)
pairs5b = pairs5b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs5b.PDR_difference # dependent variable to predict

# Drop the response, the pair key and the unused statistics; the columns that
# remain are the covariates of the regression.
X = pairs5b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# The backslash is escaped so that "\delta" is printed literally without an
# invalid "\d" escape sequence in the source.
print("Regression results for Normal 'RRBS_NormalBCD19pcell45_66', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



22
(231, 14)
(231,)
(231, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell45_66', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[76]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.247


  Model:                    OLS          Adj. R-squared:        0.237


  Method:              Least Squares     F-statistic:           24.84


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):  6.16e-14


  Time:                  00:36:41        Log-Likelihood:       408.74


  No. Observations:          231         AIC:                  -809.5


  Df Residuals:              227         BIC:                  -795.7


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -772.9205    108.837     -7.102   0.000   -987.380  -558.461


  total_cpg_no_filter   3.368e-08   1.64e-08      2.057   0.041   1.42e-09  6.59e-08


  bsRate_mean            775.5821    109.028      7.114   0.000    560.746   990.418


  avgReadCpG_mean         -0.2198      0.062     -3.520   0.001     -0.343    -0.097




  Omnibus:        16.009    Durbin-Watson:         1.644


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     17.833


  Skew:            0.665    Prob(JB):           0.000134


  Kurtosis:        2.712    Cond. No.           2.98e+10



In [77]:

    
# Joint regression plot of PDR_difference (y) against the pair-averaged bisulfite
# rate bsRate_mean (x) for the cell pairs in pairs5b; the caption is also echoed
# to the cell's text output.
print("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(
    data=pairs5b,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")









    



PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[77]:





<matplotlib.text.Text at 0x10f62d160>



In [78]:

    
# Joint regression plot of PDR_difference (y) against the pair-averaged
# avgReadCpG_mean (x) for the cell pairs in pairs5b; the caption is also echoed
# to the cell's text output.
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(
    data=pairs5b,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell45_66")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[78]:





<matplotlib.text.Text at 0x10f887d30>



In [79]:

    
# Joint regression plot of PDR_difference (y) against the pair-averaged
# total_cpg_no_filter (x) for the cell pairs in pairs5b; the caption is also
# echoed to the cell's text output.
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs5b, kind="reg")
# The title names the variable actually on the x axis (total_cpg_no_filter),
# matching the printed caption.
# NOTE(review): plt.title acts on the current axes, which may be one panel of the
# joint grid rather than the whole figure -- confirm the placement when rendered.
plt.title("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[79]:





<matplotlib.text.Text at 0x10fc61c50>



In [ ]:



In [80]:

    
# Pairwise covariate regression for protocol 'RRBS_NormalBCD19pcell67_88': for every
# unordered pair of cells, regress the absolute difference in PDR_total on the
# pair-averaged covariates.
from itertools import combinations

CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell67_88']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")

# One row per unordered pair of filenames, holding the column-wise mean of the two cells.
# numeric_only=True states explicitly that non-numeric columns are left out of the
# mean, instead of depending on pandas dropping them silently.
cc = list(combinations(CD19cell.filename, 2))
out = pd.DataFrame([CD19cellA.loc[list(c), :].mean(numeric_only=True) for c in cc], index=cc)

# |PDR_total_i - PDR_total_j| for every ordered pair, labelled by both filenames.
# The outer difference is taken on the underlying arrays (.values) so it does not
# depend on pandas Series supporting ufunc.outer.
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.PDR_total.values, CD19cell.PDR_total.values)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Attach each pair's difference to its averaged covariates; the join key is the
# (filename, filename) pair stored in the 'filename' column of both frames.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5c = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs5c.shape)
pairs5c = pairs5c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs5c.PDR_difference # dependent variable to predict

# Drop the response, the pair key and the unused statistics; the columns that
# remain are the covariates of the regression.
X = pairs5c.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# The backslash is escaped so that "\delta" is printed literally without an
# invalid "\d" escape sequence in the source.
print("Regression results for Normal 'RRBS_NormalBCD19pcell67_88', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell67_88', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[80]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.057


  Model:                    OLS          Adj. R-squared:        0.041


  Method:              Least Squares     F-statistic:           3.721


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):   0.0125 


  Time:                  00:36:45        Log-Likelihood:       324.87


  No. Observations:          190         AIC:                  -641.7


  Df Residuals:              186         BIC:                  -628.8


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -127.6417     89.834     -1.421   0.157   -304.867    49.583


  total_cpg_no_filter    -2.1e-08   1.73e-08     -1.217   0.225  -5.51e-08  1.31e-08


  bsRate_mean            128.5166     90.054      1.427   0.155    -49.141   306.174


  avgReadCpG_mean         -0.1049      0.044     -2.371   0.019     -0.192    -0.018




  Omnibus:        11.644    Durbin-Watson:         2.479


  Prob(Omnibus):   0.003    Jarque-Bera (JB):      7.458


  Skew:            0.339    Prob(JB):             0.0240


  Kurtosis:        2.306    Cond. No.           1.62e+10



In [81]:

    
# Joint regression plot of PDR_difference (y) against the pair-averaged bisulfite
# rate bsRate_mean (x) for the cell pairs in pairs5c; the caption is also echoed
# to the cell's text output.
print("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs5c, kind="reg")
# The title names the protocol that pairs5c holds (RRBS_NormalBCD19pcell67_88),
# matching the printed caption.
# NOTE(review): plt.title acts on the current axes, which may be one panel of the
# joint grid rather than the whole figure -- confirm the placement when rendered.
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88")









    



PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[81]:





<matplotlib.text.Text at 0x10feeaef0>



In [82]:

    
# Joint regression plot of PDR_difference (y) against the pair-averaged
# avgReadCpG_mean (x) for the cell pairs in pairs5c; the caption is also echoed
# to the cell's text output.
# Caption and title name the protocol that pairs5c holds (RRBS_NormalBCD19pcell67_88).
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs5c, kind="reg")
# NOTE(review): plt.title acts on the current axes, which may be one panel of the
# joint grid rather than the whole figure -- confirm the placement when rendered.
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell67_88")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[82]:





<matplotlib.text.Text at 0x110147fd0>



In [83]:

    
# Joint regression plot of PDR_difference (y) against the pair-averaged
# total_cpg_no_filter (x) for the cell pairs in pairs5c; the caption is also
# echoed to the cell's text output.
# Caption and title name the protocol that pairs5c holds (RRBS_NormalBCD19pcell67_88)
# and the variable actually on the x axis (total_cpg_no_filter).
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs5c, kind="reg")
# NOTE(review): plt.title acts on the current axes, which may be one panel of the
# joint grid rather than the whole figure -- confirm the placement when rendered.
plt.title("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[83]:





<matplotlib.text.Text at 0x11045f4a8>



In [ ]:



In [84]:

    
# Pairwise covariate regression for protocol 'normal_B_cell_A1_24': for every
# unordered pair of cells, regress the absolute difference in PDR_total on the
# pair-averaged covariates.
from itertools import combinations

normb = merged[merged["protocol"] == 'normal_B_cell_A1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")

# One row per unordered pair of filenames, holding the column-wise mean of the two cells.
# numeric_only=True states explicitly that non-numeric columns are left out of the
# mean, instead of depending on pandas dropping them silently.
cc = list(combinations(normb.filename, 2))
out = pd.DataFrame([normbA.loc[list(c), :].mean(numeric_only=True) for c in cc], index=cc)

# |PDR_total_i - PDR_total_j| for every ordered pair, labelled by both filenames.
# The outer difference is taken on the underlying arrays (.values) so it does not
# depend on pandas Series supporting ufunc.outer.
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_total.values, normb.PDR_total.values)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Attach each pair's difference to its averaged covariates; the join key is the
# (filename, filename) pair stored in the 'filename' column of both frames.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6 = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs6.shape)
pairs6 = pairs6.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6.PDR_difference # dependent variable to predict

# Drop the response, the pair key and the unused statistics; the columns that
# remain are the covariates of the regression.
X = pairs6.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# The backslash is escaped so that "\delta" is printed literally without an
# invalid "\d" escape sequence in the source.
print("Regression results for Normal 'normal_B_cell_A1_24', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_A1_24', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[84]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.036


  Model:                    OLS          Adj. R-squared:        0.018


  Method:              Least Squares     F-statistic:           2.050


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):    0.109 


  Time:                  00:36:48        Log-Likelihood:       243.98


  No. Observations:          171         AIC:                  -480.0


  Df Residuals:              167         BIC:                  -467.4


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -10.1752     16.166     -0.629   0.530    -42.092    21.742


  total_cpg_no_filter  -3.394e-08   1.74e-07     -0.195   0.846  -3.78e-07   3.1e-07


  bsRate_mean             12.3989     16.845      0.736   0.463    -20.858    45.656


  avgReadCpG_mean         -0.2361      0.100     -2.355   0.020     -0.434    -0.038




  Omnibus:        24.164    Durbin-Watson:         1.952


  Prob(Omnibus):   0.000    Jarque-Bera (JB):      7.923


  Skew:            0.226    Prob(JB):             0.0190


  Kurtosis:        2.047    Cond. No.           9.65e+08



In [85]:

    
# Regression jointplot of PDR_difference against bsRate_mean for the pairs6 frame.
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(
    data=pairs6,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg"
)
# Left as the final expression so the cell keeps echoing the returned Text object.
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24")









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[85]:





<matplotlib.text.Text at 0x1106d9198>



In [86]:

    
# Regression jointplot of PDR_difference against avgReadCpG_mean for the pairs6 frame.
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(
    data=pairs6,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg"
)
# Left as the final expression so the cell keeps echoing the returned Text object.
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_A1_24")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_A1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[86]:





<matplotlib.text.Text at 0x110918c50>



In [87]:

    
# Regression jointplot of PDR_difference against total_cpg_no_filter for the pairs6 frame.
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6, kind="reg")
# Title now names the covariate actually plotted (it previously said
# avgReadCpGs_mean, copied over from the preceding cell).
plt.title("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_A1_24")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_A1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[87]:





<matplotlib.text.Text at 0x110b55f60>



In [ ]:



In [88]:

    
# Pairwise OLS for protocol 'normal_B_cell_B1_24': for every unordered pair of
# cells, regress the absolute PDR_total difference on the pair-mean covariates.
from itertools import combinations

normb = merged[merged["protocol"] == 'normal_B_cell_B1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")

# One row per unordered filename pair, holding the mean of every numeric column
# over the two cells. numeric_only=True skips string columns explicitly rather
# than relying on pandas dropping them silently (newer pandas raises instead).
cc = list(combinations(normb.filename, 2))
out = pd.DataFrame([normbA.loc[list(c)].mean(numeric_only=True) for c in cc], index=cc)

# Absolute PDR_total difference for every ordered pair of cells. The ufunc gets
# plain ndarrays because newer pandas releases reject ufunc.outer on Series.
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_total.values, normb.PDR_total.values)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Attach the difference to each pair; 'filename' holds the (file_a, file_b) tuple
# in both frames and is the only shared column, so it is named as the join key.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6a = pd.merge(out, PDR_differences, on='filename', how='inner')
print(pairs6a.shape)
pairs6a = pairs6a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6a.PDR_difference # dependent variable to predict

# Everything that is not a covariate of interest is removed from the design matrix.
X = pairs6a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)  # adds the intercept column ('const')
est = sm.OLS(y, X).fit()
# Backslash is escaped so the literal prints "\delta" without an invalid-escape warning.
print("Regression results for Normal 'normal_B_cell_B1_24', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'normal_B_cell_B1_24', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[88]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.004


  Model:                    OLS          Adj. R-squared:       -0.012


  Method:              Least Squares     F-statistic:          0.2378


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):    0.870 


  Time:                  00:36:51        Log-Likelihood:       251.33


  No. Observations:          190         AIC:                  -494.7


  Df Residuals:              186         BIC:                  -481.7


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -15.5639     39.169     -0.397   0.692    -92.837    61.709


  total_cpg_no_filter   2.614e-08   5.37e-08      0.487   0.627  -7.98e-08  1.32e-07


  bsRate_mean             16.5309     41.141      0.402   0.688    -64.631    97.693


  avgReadCpG_mean         -0.0348      0.093     -0.376   0.707     -0.217     0.148




  Omnibus:        429.850    Durbin-Watson:         1.987


  Prob(Omnibus):   0.000     Jarque-Bera (JB):     16.233


  Skew:            0.133     Prob(JB):           0.000299


  Kurtosis:        1.593     Cond. No.           4.47e+09



In [89]:

    
# Regression jointplot of PDR_difference against bsRate_mean for the pairs6a frame.
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(
    data=pairs6a,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg"
)
# Left as the final expression so the cell keeps echoing the returned Text object.
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24")









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[89]:





<matplotlib.text.Text at 0x110e86828>



In [90]:

    
# Regression jointplot of PDR_difference against avgReadCpG_mean for the pairs6a frame.
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(
    data=pairs6a,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg"
)
# Left as the final expression so the cell keeps echoing the returned Text object.
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_B1_24")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_B1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[90]:





<matplotlib.text.Text at 0x11102ed68>



In [91]:

    
# Regression jointplot of PDR_difference against total_cpg_no_filter for the pairs6a frame.
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6a, kind="reg")
# Title now names the covariate actually plotted (it previously said
# avgReadCpGs_mean, copied over from the preceding cell).
plt.title("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_B1_24")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_B1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[91]:





<matplotlib.text.Text at 0x1113f02b0>



In [ ]:



In [92]:

    
# Pairwise OLS for protocol 'normal_B_cell_C1_24': for every unordered pair of
# cells, regress the absolute PDR_total difference on the pair-mean covariates.
from itertools import combinations

normb = merged[merged["protocol"] == 'normal_B_cell_C1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")

# One row per unordered filename pair, holding the mean of every numeric column
# over the two cells. numeric_only=True skips string columns explicitly rather
# than relying on pandas dropping them silently (newer pandas raises instead).
cc = list(combinations(normb.filename, 2))
out = pd.DataFrame([normbA.loc[list(c)].mean(numeric_only=True) for c in cc], index=cc)

# Absolute PDR_total difference for every ordered pair of cells. The ufunc gets
# plain ndarrays because newer pandas releases reject ufunc.outer on Series.
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_total.values, normb.PDR_total.values)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Attach the difference to each pair; 'filename' holds the (file_a, file_b) tuple
# in both frames and is the only shared column, so it is named as the join key.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6b = pd.merge(out, PDR_differences, on='filename', how='inner')
print(pairs6b.shape)
pairs6b = pairs6b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6b.PDR_difference # dependent variable to predict

# Everything that is not a covariate of interest is removed from the design matrix.
X = pairs6b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)


print(y.shape)
print(X.shape)
X = sm.add_constant(X)  # adds the intercept column ('const')
est = sm.OLS(y, X).fit()
# Backslash is escaped so the literal prints "\delta" without an invalid-escape warning.
print("Regression results for Normal 'normal_B_cell_C1_24', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_C1_24', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[92]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.022


  Model:                    OLS          Adj. R-squared:        0.004


  Method:              Least Squares     F-statistic:           1.240


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):    0.297 


  Time:                  00:36:54        Log-Likelihood:       237.95


  No. Observations:          171         AIC:                  -467.9


  Df Residuals:              167         BIC:                  -455.3


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   66.0441     36.511      1.809   0.072     -6.038   138.126


  total_cpg_no_filter   2.588e-08    4.1e-08      0.631   0.529  -5.51e-08  1.07e-07


  bsRate_mean            -69.4730     38.167     -1.820   0.071   -144.824     5.878


  avgReadCpG_mean          0.1250      0.121      1.037   0.301     -0.113     0.363




  Omnibus:        68.122    Durbin-Watson:         1.099


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     10.264


  Skew:            0.098    Prob(JB):            0.00591


  Kurtosis:        1.816    Cond. No.           7.17e+09



In [93]:

    
# Regression jointplot of PDR_difference against bsRate_mean for the pairs6b frame.
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(
    data=pairs6b,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg"
)
# Left as the final expression so the cell keeps echoing the returned Text object.
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24")









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[93]:





<matplotlib.text.Text at 0x11165d4a8>



In [94]:

    
# Regression jointplot of PDR_difference against avgReadCpG_mean for the pairs6b frame.
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(
    data=pairs6b,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg"
)
# Left as the final expression so the cell keeps echoing the returned Text object.
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_C1_24")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_C1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[94]:





<matplotlib.text.Text at 0x11189e978>



In [95]:

    
# Regression jointplot of PDR_difference against total_cpg_no_filter for the pairs6b frame.
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6b, kind="reg")
# Title now names the covariate actually plotted (it previously said
# avgReadCpGs_mean, copied over from the preceding cell).
plt.title("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_C1_24")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_C1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[95]:





<matplotlib.text.Text at 0x111ba5e48>



In [ ]:



In [96]:

    
# Pairwise OLS for protocol 'normal_B_cell_D1_24': for every unordered pair of
# cells, regress the absolute PDR_total difference on the pair-mean covariates.
from itertools import combinations

normb = merged[merged["protocol"] == 'normal_B_cell_D1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")

# One row per unordered filename pair, holding the mean of every numeric column
# over the two cells. numeric_only=True skips string columns explicitly rather
# than relying on pandas dropping them silently (newer pandas raises instead).
cc = list(combinations(normb.filename, 2))
out = pd.DataFrame([normbA.loc[list(c)].mean(numeric_only=True) for c in cc], index=cc)

# Absolute PDR_total difference for every ordered pair of cells. The ufunc gets
# plain ndarrays because newer pandas releases reject ufunc.outer on Series.
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_total.values, normb.PDR_total.values)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Attach the difference to each pair; 'filename' holds the (file_a, file_b) tuple
# in both frames and is the only shared column, so it is named as the join key.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6c = pd.merge(out, PDR_differences, on='filename', how='inner')
print(pairs6c.shape)
pairs6c = pairs6c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6c.PDR_difference # dependent variable to predict

# Everything that is not a covariate of interest is removed from the design matrix.
X = pairs6c.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)


print(y.shape)
print(X.shape)
X = sm.add_constant(X)  # adds the intercept column ('const')
est = sm.OLS(y, X).fit()
# Backslash is escaped so the literal prints "\delta" without an invalid-escape warning.
print("Regression results for Normal 'normal_B_cell_D1_24', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_D1_24', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[96]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.017


  Model:                    OLS          Adj. R-squared:       -0.001


  Method:              Least Squares     F-statistic:          0.9485


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):    0.419 


  Time:                  00:36:57        Log-Likelihood:       245.02


  No. Observations:          171         AIC:                  -482.0


  Df Residuals:              167         BIC:                  -469.5


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   43.6951     26.400      1.655   0.100     -8.426    95.816


  total_cpg_no_filter    6.78e-09   3.41e-08      0.199   0.842  -6.05e-08   7.4e-08


  bsRate_mean            -45.6397     27.699     -1.648   0.101   -100.326     9.046


  avgReadCpG_mean          0.0375      0.104      0.360   0.719     -0.168     0.243




  Omnibus:        391.613    Durbin-Watson:         2.062


  Prob(Omnibus):   0.000     Jarque-Bera (JB):     15.748


  Skew:            0.219     Prob(JB):           0.000381


  Kurtosis:        1.579     Cond. No.           5.35e+09



In [97]:

    
# Regression jointplot of PDR_difference against bsRate_mean for the pairs6c frame.
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(
    data=pairs6c,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg"
)
# Left as the final expression so the cell keeps echoing the returned Text object.
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24")









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[97]:





<matplotlib.text.Text at 0x111eb0e10>



In [98]:

    
# Regression jointplot of PDR_difference against avgReadCpG_mean for the pairs6c frame.
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(
    data=pairs6c,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg"
)
# Left as the final expression so the cell keeps echoing the returned Text object.
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_D1_24")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_D1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[98]:





<matplotlib.text.Text at 0x1120fae80>



In [99]:

    
# Regression jointplot of PDR_difference against total_cpg_no_filter for the pairs6c frame.
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6c, kind="reg")
# Title now names the covariate actually plotted (it previously said
# avgReadCpGs_mean, copied over from the preceding cell).
plt.title("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_D1_24")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_D1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[99]:





<matplotlib.text.Text at 0x112415048>



In [ ]:



In [100]:

    
# Pairwise OLS for protocol 'normal_B_cell_G1_22': for every unordered pair of
# cells, regress the absolute PDR_total difference on the pair-mean covariates.
from itertools import combinations

normb = merged[merged["protocol"] == 'normal_B_cell_G1_22']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")

# One row per unordered filename pair, holding the mean of every numeric column
# over the two cells. numeric_only=True skips string columns explicitly rather
# than relying on pandas dropping them silently (newer pandas raises instead).
cc = list(combinations(normb.filename, 2))
out = pd.DataFrame([normbA.loc[list(c)].mean(numeric_only=True) for c in cc], index=cc)

# Absolute PDR_total difference for every ordered pair of cells. The ufunc gets
# plain ndarrays because newer pandas releases reject ufunc.outer on Series.
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_total.values, normb.PDR_total.values)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Attach the difference to each pair; 'filename' holds the (file_a, file_b) tuple
# in both frames and is the only shared column, so it is named as the join key.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6d = pd.merge(out, PDR_differences, on='filename', how='inner')
print(pairs6d.shape)
pairs6d = pairs6d.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6d.PDR_difference # dependent variable to predict

# Everything that is not a covariate of interest is removed from the design matrix.
X = pairs6d.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)  # adds the intercept column ('const')
est = sm.OLS(y, X).fit()
# Backslash is escaped so the literal prints "\delta" without an invalid-escape warning.
print("Regression results for Normal 'normal_B_cell_G1_22', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_G1_22', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[100]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.076


  Model:                    OLS          Adj. R-squared:        0.060


  Method:              Least Squares     F-statistic:           4.598


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):   0.00404


  Time:                  00:37:01        Log-Likelihood:       258.18


  No. Observations:          171         AIC:                  -508.4


  Df Residuals:              167         BIC:                  -495.8


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -41.3736     23.072     -1.793   0.075    -86.925     4.177


  total_cpg_no_filter   1.483e-08   4.77e-08      0.311   0.756  -7.94e-08  1.09e-07


  bsRate_mean             41.4649     24.269      1.709   0.089     -6.449    89.379


  avgReadCpG_mean          0.2303      0.108      2.128   0.035      0.017     0.444




  Omnibus:        37.886    Durbin-Watson:         1.943


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     10.655


  Skew:            0.314    Prob(JB):            0.00486


  Kurtosis:        1.951    Cond. No.           4.19e+09



In [101]:

    
# Regression jointplot of PDR_difference against bsRate_mean for the pairs6d frame.
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(
    data=pairs6d,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg"
)
# Left as the final expression so the cell keeps echoing the returned Text object.
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22")









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[101]:





<matplotlib.text.Text at 0x112743a90>



In [102]:

    
# Regression jointplot of PDR_difference against avgReadCpG_mean for the pairs6d frame.
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(
    data=pairs6d,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg"
)
# Left as the final expression so the cell keeps echoing the returned Text object.
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_G1_22")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_G1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[102]:





<matplotlib.text.Text at 0x112963da0>



In [103]:

    
# Regression jointplot of PDR_difference against total_cpg_no_filter for the pairs6d frame.
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6d, kind="reg")
# Title now names the covariate actually plotted (it previously said
# avgReadCpGs_mean, copied over from the preceding cell).
plt.title("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_G1_22")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_G1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[103]:





<matplotlib.text.Text at 0x112c8a2b0>



In [ ]:



In [104]:

    
# Pairwise OLS for protocol 'normal_B_cell_H1_22': for every unordered pair of
# cells, regress the absolute PDR_total difference on the pair-mean covariates.
from itertools import combinations

normb = merged[merged["protocol"] == 'normal_B_cell_H1_22']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")

# One row per unordered filename pair, holding the mean of every numeric column
# over the two cells. numeric_only=True skips string columns explicitly rather
# than relying on pandas dropping them silently (newer pandas raises instead).
cc = list(combinations(normb.filename, 2))
out = pd.DataFrame([normbA.loc[list(c)].mean(numeric_only=True) for c in cc], index=cc)

# Absolute PDR_total difference for every ordered pair of cells. The ufunc gets
# plain ndarrays because newer pandas releases reject ufunc.outer on Series.
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_total.values, normb.PDR_total.values)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Attach the difference to each pair; 'filename' holds the (file_a, file_b) tuple
# in both frames and is the only shared column, so it is named as the join key.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6e = pd.merge(out, PDR_differences, on='filename', how='inner')
print(pairs6e.shape)
pairs6e = pairs6e.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6e.PDR_difference # dependent variable to predict

# Everything that is not a covariate of interest is removed from the design matrix.
X = pairs6e.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)


print(y.shape)
print(X.shape)
X = sm.add_constant(X)  # adds the intercept column ('const')
est = sm.OLS(y, X).fit()
# Backslash is escaped so the literal prints "\delta" without an invalid-escape warning.
print("Regression results for Normal 'normal_B_cell_H1_22', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



15
(105, 14)
(105,)
(105, 3)
Regression results for Normal 'normal_B_cell_H1_22', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[104]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.192


  Model:                    OLS          Adj. R-squared:        0.168


  Method:              Least Squares     F-statistic:           8.001


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):  7.75e-05


  Time:                  00:37:05        Log-Likelihood:       160.67


  No. Observations:          105         AIC:                  -313.3


  Df Residuals:              101         BIC:                  -302.7


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  116.9591     47.402      2.467   0.015     22.927   210.992


  total_cpg_no_filter   1.576e-07   6.91e-08      2.280   0.025   2.05e-08  2.95e-07


  bsRate_mean           -116.3707     48.869     -2.381   0.019   -213.314   -19.428


  avgReadCpG_mean         -0.7359      0.195     -3.776   0.000     -1.122    -0.349




  Omnibus:        12.673    Durbin-Watson:         1.403


  Prob(Omnibus):   0.002    Jarque-Bera (JB):      6.567


  Skew:            0.419    Prob(JB):             0.0375


  Kurtosis:        2.106    Cond. No.           6.03e+09



In [105]:

    
# Regression jointplot for the normal_B_cell_H1_22 pairs (`pairs6e`):
# pair-averaged bisulfite conversion rate on x, pairwise PDR difference on y.
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(
    data=pairs6e,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
# Last expression of the cell: the returned Text object becomes the cell output.
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22")









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[105]:





<matplotlib.text.Text at 0x112eea278>



In [106]:

    
# Regression jointplot for the normal_B_cell_H1_22 pairs (`pairs6e`):
# pair-averaged avgReadCpG on x, pairwise PDR difference on y.
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(
    data=pairs6e,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
# Last expression of the cell: the returned Text object becomes the cell output.
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_H1_22")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_H1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[106]:





<matplotlib.text.Text at 0x1130f3b00>



In [107]:

    
# Regression jointplot for the normal_B_cell_H1_22 pairs (`pairs6e`):
# pair-averaged total unique CpGs on x, pairwise PDR difference on y.
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6e, kind="reg")
# Title corrected: it previously named avgReadCpGs_mean (copy-paste from the
# preceding cell) although the x-axis here is total_cpg_no_filter.
plt.title("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_H1_22")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_H1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[107]:





<matplotlib.text.Text at 0x11165f2e8>



In [108]:

    
# Sanity check: print the (rows, columns) shape of every per-batch pair table,
# one per line, in the same order they are later concatenated.
print(
    *(
        pair_table.shape
        for pair_table in (
            pairs1, pairs1a,
            pairs2, pairs2a, pairs2b,
            pairs3, pairs3a, pairs3b, pairs3c,
            pairs4, pairs4a, pairs4b, pairs4c,
            pairs5, pairs5a, pairs5b, pairs5c,
            pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e,
        )
    ),
    sep="\n"
)









    



(210, 14)
(210, 14)
(190, 14)
(190, 14)
(190, 14)
(136, 14)
(231, 14)
(36, 14)
(190, 14)
(153, 14)
(153, 14)
(136, 14)
(210, 14)
(210, 14)
(190, 14)
(231, 14)
(190, 14)
(171, 14)
(190, 14)
(171, 14)
(171, 14)
(171, 14)
(105, 14)



In [109]:

    
# Tag every per-batch pair table with its cell type (added in place as a new
# 'type' column), then collect all tables in a fixed order for concatenation.
for _cll_table in (pairs1, pairs1a, pairs2, pairs2a, pairs2b):
    _cll_table['type'] = 'CLL'

for _normal_table in (
    pairs3, pairs3a, pairs3b, pairs3c,
    pairs4, pairs4a, pairs4b, pairs4c,
    pairs5, pairs5a, pairs5b, pairs5c,
    pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e,
):
    _normal_table['type'] = 'normal'


frames = [
    pairs1, pairs1a, pairs2, pairs2a, pairs2b,
    pairs3, pairs3a, pairs3b, pairs3c,
    pairs4, pairs4a, pairs4b, pairs4c,
    pairs5, pairs5a, pairs5b, pairs5c,
    pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e,
]



In [110]:

    
# Stack all per-batch pair tables (CLL and normal) row-wise into one table.
# Each frame's own row index is kept, so index labels repeat across batches.
total_pairs = pd.concat(frames)



In [111]:

    
# (rows, columns) of the combined pair table.
total_pairs.shape









    Out[111]:





(4035, 15)



In [112]:

    
# Response vector: absolute PDR difference of each cell pair.
y = total_pairs["PDR_difference"] # dependent variable

# Design matrix: everything left after removing the response, the pair key and
# the columns that are not used as covariates ('type' is kept for dummy coding).
_non_covariates = [
    "PDR_difference", "filename", "methylation", "PDR_total",
    "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
    "total_cpg_gtrthan1", "total_cpg_gtrthan38",
    "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG",
]
X = total_pairs.drop(_non_covariates, axis=1)



In [113]:

    
# One-hot encode the categorical covariate(s), use 'normal' as the reference
# level (its dummy is dropped), and fit an OLS model of the pairwise PDR
# difference on the pair-averaged covariates plus the CLL indicator.
categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing".
    # Assigned to a new Series rather than using inplace=True on a column
    # selection, which is not guaranteed to write back into X.
    filled = X[variable].fillna("Missing")
    # Create array of dummies. Cast to float so the design matrix stays numeric
    # regardless of the dtype get_dummies returns (float, uint8 or bool
    # depending on the pandas version).
    dummies = pd.get_dummies(filled, prefix=variable).astype(float)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X.drop([variable], axis=1), dummies], axis=1)

X = X.drop(['type_normal'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# Raw string: "\d" is not a valid escape sequence; the printed text is unchanged.
print(r"Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR_total")
est.summary()









    



Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR_total






    Out[113]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:              0.311 


  Model:                    OLS          Adj. R-squared:         0.310 


  Method:              Least Squares     F-statistic:            454.8 


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):   4.94e-324


  Time:                  00:37:09        Log-Likelihood:        7222.7 


  No. Observations:         4035         AIC:                -1.444e+04


  Df Residuals:             4030         BIC:                -1.440e+04


  Df Model:                    4                                       


  Covariance Type:       nonrobust                                     




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    1.5827      0.049     32.365   0.000      1.487     1.679


  total_cpg_no_filter   3.996e-09   3.35e-09      1.193   0.233  -2.57e-09  1.06e-08


  bsRate_mean             -1.4618      0.046    -31.952   0.000     -1.551    -1.372


  avgReadCpG_mean         -0.0142      0.001    -12.779   0.000     -0.016    -0.012


  type_CLL                -0.0592      0.002    -33.812   0.000     -0.063    -0.056




  Omnibus:        363.360    Durbin-Watson:         1.604 


  Prob(Omnibus):   0.000     Jarque-Bera (JB):    469.482 


  Skew:            0.786     Prob(JB):           1.13e-102


  Kurtosis:        3.568     Cond. No.           5.10e+07



In [114]:

    
# Peek at the first rows of the design matrix actually passed to OLS
# (constant, the three averaged covariates and the CLL indicator).
X.head()









    Out[114]:






  
    
      
      const
      total_cpg_no_filter
      bsRate_mean
      avgReadCpG_mean
      type_CLL
    
  
  
    
      0
      1
      709541.5
      0.980181
      7.029760
      1.0
    
    
      1
      1
      786284.0
      0.980293
      7.017405
      1.0
    
    
      2
      1
      683623.5
      0.980268
      7.039100
      1.0
    
    
      3
      1
      853200.5
      0.980262
      7.028117
      1.0
    
    
      4
      1
      842686.5
      0.980228
      7.025451
      1.0



In [115]:

    
# Regression jointplot over all pairs (CLL and normal together):
# pair-averaged bisulfite conversion rate on x, pairwise PDR difference on y.
print("PDR_difference vs. bsRate, jointplot, both CLL and normal ")
sns.jointplot(
    data=total_pairs,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
# Last expression of the cell: the returned Text object becomes the cell output.
plt.title("PDR_difference vs. bsRate, jointplot, both CLL and normal ")









    



PDR_difference vs. bsRate, jointplot, both CLL and normal 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[115]:





<matplotlib.text.Text at 0x113501e48>



In [116]:

    
# Regression jointplot over all pairs (CLL and normal together):
# pair-averaged avgReadCpG on x, pairwise PDR difference on y.
print("PDR_difference vs. mean avgReadCpG per cell, jointplot, both CLL and normal ")
sns.jointplot(
    data=total_pairs,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
# Last expression of the cell: the returned Text object becomes the cell output.
plt.title("PDR_difference vs. avgReadCpG_mean, jointplot, both CLL and normal ")









    



PDR_difference vs. mean avgReadCpG per cell, jointplot, both CLL and normal 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[116]:





<matplotlib.text.Text at 0x1137eda90>



In [117]:

    
# Regression jointplot over all pairs (CLL and normal together):
# pair-averaged total unique CpGs on x, pairwise PDR difference on y.
sns.jointplot(
    data=total_pairs,
    x="total_cpg_no_filter",
    y="PDR_difference",
    kind="reg",
)
# Last expression of the cell: the returned Text object becomes the cell output.
plt.title("PDR_difference vs. total # Unique_CpGs per cell, jointplot, both CLL and normal ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[117]:





<matplotlib.text.Text at 0x113b01f28>



In [118]:

    
print("PDR_difference vs. bsRate, by type! ")
sns.lmplot(x="bsRate_mean", y="PDR_difference",  data=total_pairs, hue='type')
plt.title("PDR_difference vs. bsRate, by type, CLL vs normal")









    



PDR_difference vs. bsRate, by type! 






    Out[118]:





<matplotlib.text.Text at 0x113e04780>



In [119]:

    
print("PDR_difference vs. avgReadCpG per cell, by type! ")
sns.lmplot(x="avgReadCpG_mean", y="PDR_difference",  data=total_pairs, hue='type')
plt.title("PDR_difference vs. avgReadCpG per cell, by type, CLL vs Normal ")









    



PDR_difference vs. avgReadCpG per cell, by type! 






    Out[119]:





<matplotlib.text.Text at 0x113e7b240>



In [120]:

    
# Separate regression lines per cell type (hue='type') over all pairs:
# pair-averaged total unique CpGs on x, pairwise PDR difference on y.
print("PDR_difference vs. total unique CpG per cel, by type!")
sns.lmplot(
    data=total_pairs,
    x="total_cpg_no_filter",
    y="PDR_difference",
    hue='type',
)
plt.title("PDR_difference vs. total unique CpG per cell, by type, CLL vs Normal ")









    



PDR_difference vs. total unique CpG per cel, by type!






    Out[120]:





<matplotlib.text.Text at 0x113f6c860>



In [121]:

    
#
# Let's see feature ranking
#
# Fits a random forest on the current design matrix X (all pairs, including the
# OLS constant column and the CLL indicator) and plots impurity-based feature
# importances in ascending order.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
# Series.sort() is deprecated (and removed in later pandas releases);
# sort_values() gives the same ascending order without mutating in place.
feature_importances = feature_importances.sort_values()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: cell type, avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
# NOTE: model.score(X, y) is R^2 on the training data itself; the out-of-bag
# estimate requested via oob_score=True is available as model.oob_score_.
print(str("Random Forest model score is ") + str(model.score(X,y)))









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:11: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting






    



Random Forest model score is 0.898775415993



In [122]:

    
#
# Now do analysis by cell type, CLL vs normal, entire
#
# Group the per-batch pair tables by cell type and stack each group row-wise,
# printing the resulting (rows, columns) shape of each combined table.
cll_frames = [
    pairs1, pairs1a,
    pairs2, pairs2a, pairs2b,
]

normal_frames = [
    pairs3, pairs3a, pairs3b, pairs3c,
    pairs4, pairs4a, pairs4b, pairs4c,
    pairs5, pairs5a, pairs5b, pairs5c,
    pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e,
]

cll_pairs = pd.concat(cll_frames)
print(cll_pairs.shape)

normal_pairs = pd.concat(normal_frames)
print(normal_pairs.shape)









    



(990, 15)
(3045, 15)



In [123]:

    
#
# CLL first
#
# OLS of the pairwise PDR difference on the pair-averaged covariates, fitted on
# the CLL pairs only ('type' is constant within this subset, so it is dropped).
y = cll_pairs.PDR_difference # dependent variable
X = cll_pairs.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG", "type"], axis=1)

X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# Label corrected: this model is fitted on cll_pairs only, not on the combined
# 'Normal B' vs 'CLL' data. Raw string because "\d" is not a valid escape.
print(r"Regression results, CLL pairs only, predict \delta PDR")
print(X.shape)
est.summary()









    



Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR
(990, 4)






    Out[123]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.030


  Model:                    OLS          Adj. R-squared:        0.027


  Method:              Least Squares     F-statistic:           10.25


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):  1.20e-06


  Time:                  00:39:48        Log-Likelihood:       3620.6


  No. Observations:          990         AIC:                  -7233.


  Df Residuals:              986         BIC:                  -7214.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    0.1695      0.042      4.035   0.000      0.087     0.252


  total_cpg_no_filter   6.446e-10   1.69e-09      0.382   0.702  -2.67e-09  3.96e-09


  bsRate_mean             -0.1460      0.038     -3.824   0.000     -0.221    -0.071


  avgReadCpG_mean         -0.0028      0.003     -1.024   0.306     -0.008     0.003




  Omnibus:        90.328    Durbin-Watson:         1.969


  Prob(Omnibus):   0.000    Jarque-Bera (JB):    113.391


  Skew:            0.805    Prob(JB):           2.38e-25


  Kurtosis:        3.396    Cond. No.           1.66e+08



In [124]:

    
# Regression jointplot restricted to CLL pairs:
# pair-averaged bisulfite conversion rate on x, pairwise PDR difference on y.
print("PDR_difference vs. bsRate, jointplot, CLL pairs ")
# Fixed: plot cll_pairs. The cell previously passed total_pairs, so the figure
# showed CLL and normal pairs together despite its "CLL pairs" title.
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=cll_pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  CLL pairs ")









    



PDR_difference vs. bsRate, jointplot, CLL pairs 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[124]:





<matplotlib.text.Text at 0x1369745f8>



In [125]:

    
# Regression jointplot restricted to CLL pairs:
# pair-averaged avgReadCpG on x, pairwise PDR difference on y.
print("PDR_difference vs. mean avgReadCpG per cell, jointplot,  CLL pairs ")
# Fixed: plot cll_pairs. The cell previously passed total_pairs, so the figure
# showed CLL and normal pairs together despite its "CLL pairs" title.
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=cll_pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpG_mean, jointplot, CLL pairs ")









    



PDR_difference vs. mean avgReadCpG per cell, jointplot,  CLL pairs 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[125]:





<matplotlib.text.Text at 0x2262410f0>



In [126]:

    
# Regression jointplot restricted to CLL pairs:
# pair-averaged total unique CpGs on x, pairwise PDR difference on y.
print("PDR difference vs. total # unique CpGs per cell, CLL pairs")
# Fixed: plot cll_pairs. The cell previously passed total_pairs, so the figure
# showed CLL and normal pairs together despite its "CLL pairs" title.
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=cll_pairs, kind="reg")
plt.title("PDR_difference vs. total # Unique_CpGs per cell,  CLL pairs")









    



PDR difference vs. total # unique CpGs per cell, CLL pairs






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[126]:





<matplotlib.text.Text at 0x1ea12c160>



In [127]:

    
# Random-forest feature ranking on the current X / y (set by the preceding
# regression cell) with impurity-based importances plotted in ascending order.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
# Series.sort() is deprecated (and removed in later pandas releases);
# sort_values() gives the same ascending order without mutating in place.
feature_importances = feature_importances.sort_values()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
# NOTE: model.score(X, y) is R^2 on the training data itself; the out-of-bag
# estimate requested via oob_score=True is available as model.oob_score_.
print(str("Random Forest model score is ") + str(model.score(X,y)))









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting






    



Random Forest model score is 0.876295293014



In [ ]:



In [128]:

    
#
# normal
#
# OLS of the pairwise PDR difference on the pair-averaged covariates, fitted on
# the normal pairs only ('type' is constant within this subset, so it is dropped).
y = normal_pairs.PDR_difference # dependent variable
X = normal_pairs.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG", "type"], axis=1)


X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# Label corrected: this model is fitted on normal_pairs only, not on the combined
# 'Normal B' vs 'CLL' data. Raw string because "\d" is not a valid escape.
print(r"Regression results, Normal pairs only, predict \delta PDR")
print(X.shape)
est.summary()









    



Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR
(3045, 4)






    Out[128]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:              0.225 


  Model:                    OLS          Adj. R-squared:         0.224 


  Method:              Least Squares     F-statistic:            293.7 


  Date:              Wed, 10 Aug 2016    Prob (F-statistic):   2.00e-167


  Time:                  00:40:36        Log-Likelihood:        5063.2 


  No. Observations:         3045         AIC:                -1.012e+04


  Df Residuals:             3041         BIC:                -1.009e+04


  Df Model:                    3                                       


  Covariance Type:       nonrobust                                     




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    1.7697      0.060     29.333   0.000      1.651     1.888


  total_cpg_no_filter  -1.158e-08   4.46e-09     -2.596   0.009  -2.03e-08 -2.83e-09


  bsRate_mean             -1.6289      0.056    -29.076   0.000     -1.739    -1.519


  avgReadCpG_mean         -0.0167      0.001    -12.778   0.000     -0.019    -0.014




  Omnibus:        177.117    Durbin-Watson:         1.634


  Prob(Omnibus):   0.000     Jarque-Bera (JB):    205.603


  Skew:            0.628     Prob(JB):           2.26e-45


  Kurtosis:        2.798     Cond. No.           4.39e+07



In [129]:

    
# Regression jointplot restricted to normal pairs:
# pair-averaged bisulfite conversion rate on x, pairwise PDR difference on y.
print("PDR_difference vs. bsRate, jointplot, Normal pairs ")
# Fixed: plot normal_pairs. The cell previously passed total_pairs, so the figure
# showed CLL and normal pairs together despite its "Normal pairs" title.
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=normal_pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal pairs ")









    



PDR_difference vs. bsRate, jointplot, Normal pairs 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[129]:





<matplotlib.text.Text at 0x1148a29b0>



In [130]:

    
# Regression jointplot restricted to normal pairs:
# pair-averaged avgReadCpG on x, pairwise PDR difference on y.
print("PDR_difference vs. mean avgReadCpG per cell, jointplot, Normal pairs ")
# Fixed: plot normal_pairs. The cell previously passed total_pairs, so the figure
# showed CLL and normal pairs together despite its "Normal pairs" title.
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=normal_pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpG_mean, jointplot, Normal pairs ")









    



PDR_difference vs. mean avgReadCpG per cell, jointplot, Normal pairs 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[130]:





<matplotlib.text.Text at 0x114bd7630>



In [ ]:

    
# Regression jointplot restricted to normal pairs:
# pair-averaged total unique CpGs on x, pairwise PDR difference on y.
# (Also fixes the "diference" typo in the printed heading.)
print("PDR difference vs. total # unique CpGs per cell, Normal pairs")
# Fixed: plot normal_pairs. The cell previously passed total_pairs, so the figure
# showed CLL and normal pairs together despite its "Normal pairs" title.
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=normal_pairs, kind="reg")
plt.title("PDR_difference vs. total # Unique_CpGs per cell, Normal pairs")









    



PDR diference vs. total # unique CpGs per cell, Normal pairs






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[ ]:





<matplotlib.text.Text at 0x1150fb9e8>



In [ ]:

    
# Random-forest feature ranking on the current X / y (set by the preceding
# regression cell) with impurity-based importances plotted in ascending order.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
# Series.sort() is deprecated (and removed in later pandas releases);
# sort_values() gives the same ascending order without mutating in place.
feature_importances = feature_importances.sort_values()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
# NOTE: model.score(X, y) is R^2 on the training data itself; the out-of-bag
# estimate requested via oob_score=True is available as model.oob_score_.
print(str("Random Forest model score is ") + str(model.score(X,y)))



In [ ]:



In [ ]:



In [ ]:



In [ ]:

	filename	methylation	PDR_total	methylation_unweighted	PDR_unweighted	thisMeth	mixedReadCount	total_reads	type	bio	protocol	total_cpg_no_filter	total_cpg_gtrthan1	total_cpg_gtrthan38	bsRate	avgReadCpgs_nofilter	avgReadCpgs_lessthan1CpG	avgReadCpgs_gtreql3.8CpG
0	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CATGAC	0.529505	0.235795	0.632802	0.231878	2208325.0	983394.0	4170549.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	525282.0	525251.0	435636.0	0.9975	5.354284	5.355660	7.019255
1	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CCTTCG	0.455550	0.177631	0.583859	0.175371	733064.0	285841.0	1609185.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	221972.0	221962.0	186757.0	0.9975	5.587294	5.588449	7.302612
2	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CGGTAG	0.515269	0.177645	0.618578	0.174221	1452802.0	500870.0	2819500.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	355730.0	355713.0	295624.0	0.9975	5.393199	5.394331	7.079288
3	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTATTG	0.556175	0.176367	0.652727	0.172273	2279354.0	722800.0	4098270.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	483179.0	483150.0	397812.0	0.9975	5.287116	5.288477	6.979525
4	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTCAGC	0.528642	0.181331	0.640401	0.172287	1394208.0	478231.0	2637340.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	356122.0	356100.0	294065.0	0.9975	5.314302	5.315719	6.995052

Dep. Variable:	PDR_difference	R-squared:	0.015
Model:	OLS	Adj. R-squared:	0.001
Method:	Least Squares	F-statistic:	1.038
Date:	Wed, 10 Aug 2016	Prob (F-statistic):	0.377
Time:	00:35:47	Log-Likelihood:	826.52
No. Observations:	210	AIC:	-1645.
Df Residuals:	206	BIC:	-1632.
Df Model:	3
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
const	0.0408	4.568	0.009	0.993	-8.966 9.048
total_cpg_no_filter	-5.724e-09	4.51e-09	-1.268	0.206	-1.46e-08 3.17e-09
bsRate_mean	-0.0226	4.658	-0.005	0.996	-9.206 9.161
avgReadCpG_mean	-0.0011	0.012	-0.094	0.925	-0.024 0.022

Omnibus:	14.287	Durbin-Watson:	1.921
Prob(Omnibus):	0.001	Jarque-Bera (JB):	14.954
Skew:	0.618	Prob(JB):	0.000566
Kurtosis:	2.575	Cond. No.	1.46e+10

Omnibus:	12.567	Durbin-Watson:	2.501
Prob(Omnibus):	0.002	Jarque-Bera (JB):	10.852
Skew:	0.478	Prob(JB):	0.00440
Kurtosis:	2.428	Cond. No.	1.68e+10

Omnibus:	14.219	Durbin-Watson:	2.170
Prob(Omnibus):	0.001	Jarque-Bera (JB):	8.815
Skew:	0.378	Prob(JB):	0.0122
Kurtosis:	2.264	Cond. No.	3.27e+09

Omnibus:	9.844	Durbin-Watson:	1.839
Prob(Omnibus):	0.007	Jarque-Bera (JB):	8.185
Skew:	0.420	Prob(JB):	0.0167
Kurtosis:	2.427	Cond. No.	4.47e+09

Omnibus:	10.262	Durbin-Watson:	1.833
Prob(Omnibus):	0.006	Jarque-Bera (JB):	11.040
Skew:	0.582	Prob(JB):	0.00401
Kurtosis:	2.805	Cond. No.	6.96e+09

Omnibus:	9.591	Durbin-Watson:	1.827
Prob(Omnibus):	0.008	Jarque-Bera (JB):	10.368
Skew:	0.671	Prob(JB):	0.00561
Kurtosis:	2.830	Cond. No.	7.56e+09

Omnibus:	39.953	Durbin-Watson:	1.261
Prob(Omnibus):	0.000	Jarque-Bera (JB):	55.176
Skew:	1.099	Prob(JB):	1.04e-12
Kurtosis:	3.947	Cond. No.	1.03e+10

Omnibus:	1.836	Durbin-Watson:	1.647
Prob(Omnibus):	0.399	Jarque-Bera (JB):	1.718
Skew:	0.484	Prob(JB):	0.424
Kurtosis:	2.541	Cond. No.	3.29e+10

Omnibus:	12.462	Durbin-Watson:	1.572
Prob(Omnibus):	0.002	Jarque-Bera (JB):	13.739
Skew:	0.650	Prob(JB):	0.00104
Kurtosis:	2.788	Cond. No.	8.48e+09

Omnibus:	50.820	Durbin-Watson:	1.593
Prob(Omnibus):	0.000	Jarque-Bera (JB):	94.868
Skew:	1.586	Prob(JB):	2.51e-21
Kurtosis:	5.197	Cond. No.	1.52e+10

Omnibus:	71.167	Durbin-Watson:	0.230
Prob(Omnibus):	0.000	Jarque-Bera (JB):	175.960
Skew:	2.063	Prob(JB):	6.18e-39
Kurtosis:	6.253	Cond. No.	1.55e+10

Omnibus:	3.164	Durbin-Watson:	1.788
Prob(Omnibus):	0.206	Jarque-Bera (JB):	3.203
Skew:	0.356	Prob(JB):	0.202
Kurtosis:	2.759	Cond. No.	1.28e+10

Omnibus:	63.937	Durbin-Watson:	0.324
Prob(Omnibus):	0.000	Jarque-Bera (JB):	124.179
Skew:	1.519	Prob(JB):	1.08e-27
Kurtosis:	5.228	Cond. No.	3.13e+10

Omnibus:	100.127	Durbin-Watson:	1.953
Prob(Omnibus):	0.000	Jarque-Bera (JB):	15.496
Skew:	0.291	Prob(JB):	0.000432
Kurtosis:	1.803	Cond. No.	1.48e+10

Omnibus:	16.391	Durbin-Watson:	1.074
Prob(Omnibus):	0.000	Jarque-Bera (JB):	18.711
Skew:	0.769	Prob(JB):	8.65e-05
Kurtosis:	3.024	Cond. No.	1.63e+10

Omnibus:	16.009	Durbin-Watson:	1.644
Prob(Omnibus):	0.000	Jarque-Bera (JB):	17.833
Skew:	0.665	Prob(JB):	0.000134
Kurtosis:	2.712	Cond. No.	2.98e+10

Omnibus:	11.644	Durbin-Watson:	2.479
Prob(Omnibus):	0.003	Jarque-Bera (JB):	7.458
Skew:	0.339	Prob(JB):	0.0240
Kurtosis:	2.306	Cond. No.	1.62e+10

Omnibus:	24.164	Durbin-Watson:	1.952
Prob(Omnibus):	0.000	Jarque-Bera (JB):	7.923
Skew:	0.226	Prob(JB):	0.0190
Kurtosis:	2.047	Cond. No.	9.65e+08

Omnibus:	429.850	Durbin-Watson:	1.987
Prob(Omnibus):	0.000	Jarque-Bera (JB):	16.233
Skew:	0.133	Prob(JB):	0.000299
Kurtosis:	1.593	Cond. No.	4.47e+09

Omnibus:	68.122	Durbin-Watson:	1.099
Prob(Omnibus):	0.000	Jarque-Bera (JB):	10.264
Skew:	0.098	Prob(JB):	0.00591
Kurtosis:	1.816	Cond. No.	7.17e+09

Omnibus:	391.613	Durbin-Watson:	2.062
Prob(Omnibus):	0.000	Jarque-Bera (JB):	15.748
Skew:	0.219	Prob(JB):	0.000381
Kurtosis:	1.579	Cond. No.	5.35e+09

Omnibus:	37.886	Durbin-Watson:	1.943
Prob(Omnibus):	0.000	Jarque-Bera (JB):	10.655
Skew:	0.314	Prob(JB):	0.00486
Kurtosis:	1.951	Cond. No.	4.19e+09

Omnibus:	12.673	Durbin-Watson:	1.403
Prob(Omnibus):	0.002	Jarque-Bera (JB):	6.567
Skew:	0.419	Prob(JB):	0.0375
Kurtosis:	2.106	Cond. No.	6.03e+09

Omnibus:	363.360	Durbin-Watson:	1.604
Prob(Omnibus):	0.000	Jarque-Bera (JB):	469.482
Skew:	0.786	Prob(JB):	1.13e-102
Kurtosis:	3.568	Cond. No.	5.10e+07

	const	total_cpg_no_filter	bsRate_mean	avgReadCpG_mean	type_CLL
0	1	709541.5	0.980181	7.029760	1.0
1	1	786284.0	0.980293	7.017405	1.0
2	1	683623.5	0.980268	7.039100	1.0
3	1	853200.5	0.980262	7.028117	1.0
4	1	842686.5	0.980228	7.025451	1.0

Omnibus:	90.328	Durbin-Watson:	1.969
Prob(Omnibus):	0.000	Jarque-Bera (JB):	113.391
Skew:	0.805	Prob(JB):	2.38e-25
Kurtosis:	3.396	Cond. No.	1.66e+08

Omnibus:	177.117	Durbin-Watson:	1.634
Prob(Omnibus):	0.000	Jarque-Bera (JB):	205.603
Skew:	0.628	Prob(JB):	2.26e-45
Kurtosis:	2.798	Cond. No.	4.39e+07