notebook.community

Edit and run



In [1]:

    
%matplotlib inline



In [2]:

    
#
# Covariates are: 
#   - Number of unique CpGs per cell
#   - Median Average Read CpG per cell (or mean if normally distributed)
#   - BS rate per cell
#   - CLL or Normal status per cell
#
# For distances (i.e. the PDR difference or methylation difference between the two cells of a pair),
# the covariates are the means of the two cells' values.
#



In [3]:

    
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # show up to 50 columns when a DataFrame is displayed


import os
# Working directory holding the RRBS CSV files read below. Set the RRBS_DATA_DIR
# environment variable to run on another machine; the original path is the default.
os.chdir(os.environ.get("RRBS_DATA_DIR", '/Users/evanbiederstedt/Downloads/RRBS_data_files'))

import statsmodels.api as sm



In [4]:

    
# Load the RRBS statistics table (relative path: resolved against the working directory set above).
stats = pd.read_csv("all_RRBS_statistics_final.csv")



In [5]:

    
# Dimensions (rows, columns) of the statistics table.
stats.shape









    Out[5]:





(438, 15)



In [6]:

    
# Split the statistics table by the value of its "type" column.
normal = stats.loc[stats["type"].eq("normal")]
CLL = stats.loc[stats["type"].eq("CLL")]



In [7]:

    
# Number of rows whose type is "normal".
len(normal)









    Out[7]:





336



In [8]:

    
# Number of rows whose type is "CLL".
len(CLL)









    Out[8]:





102



In [9]:

    
# Read the six per-group CpG tables (file names suggest average CpGs per read) in one pass;
# the tuple order below matches the order of the target names on the left.
mcell_cpg, pcell_cpg, CD19cell_cpg, normalB_cell_cpg, trito_cell_cpg, cw154_cell_cpg = [
    pd.read_csv(csv_name)
    for csv_name in (
        "mcell_avgCpGs.csv",
        "pcell_avgCpGs.csv",
        "CD19_avgCpGs.csv",
        "normalB_avgCpGs.csv",
        "trito_avgCpGs.csv",
        "cw154_cpgs.csv",
    )
]



In [10]:

    
# Remove the "Unnamed: 0" column from every CpG table (presumably the row index
# written out when the CSVs were saved -- confirm).
# errors="ignore" makes the cell idempotent: re-running it after the column has
# already been dropped no longer raises KeyError.
mcell_cpg = mcell_cpg.drop(["Unnamed: 0"], axis=1, errors="ignore")
pcell_cpg = pcell_cpg.drop(["Unnamed: 0"], axis=1, errors="ignore")
CD19cell_cpg = CD19cell_cpg.drop(["Unnamed: 0"], axis=1, errors="ignore")
normalB_cell_cpg = normalB_cell_cpg.drop(["Unnamed: 0"], axis=1, errors="ignore")
trito_cell_cpg = trito_cell_cpg.drop(["Unnamed: 0"], axis=1, errors="ignore")
cw154_cell_cpg = cw154_cell_cpg.drop(["Unnamed: 0"], axis=1, errors="ignore")



In [11]:

    
# Stack the six CpG tables row-wise into one frame; original row labels are kept.
cpg_total = pd.concat(
    (
        mcell_cpg,
        pcell_cpg,
        CD19cell_cpg,
        normalB_cell_cpg,
        trito_cell_cpg,
        cw154_cell_cpg,
    ),
    axis=0,
)



In [12]:

    
# Dimensions (rows, columns) of the combined CpG table.
cpg_total.shape









    Out[12]:





(513, 4)



In [13]:

    
# Attach the CpG columns to the statistics table; inner join on "filename",
# so only files present in both tables are kept.
merged = pd.merge(stats, cpg_total, how="inner", on="filename")



In [14]:

    
# Dimensions (rows, columns) after the merge.
merged.shape









    Out[14]:





(438, 18)



In [15]:

    
# Preview the first rows of the merged table.
merged.head()









    Out[15]:






  
    
      
      filename
      methylation
      PDR_total
      methylation_unweighted
      PDR_unweighted
      thisMeth
      mixedReadCount
      total_reads
      type
      bio
      protocol
      total_cpg_no_filter
      total_cpg_gtrthan1
      total_cpg_gtrthan38
      bsRate
      avgReadCpgs_nofilter
      avgReadCpgs_lessthan1CpG
      avgReadCpgs_gtreql3.8CpG
    
  
  
    
      0
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CATGAC
      0.529505
      0.235795
      0.632802
      0.231878
      2208325.0
      983394.0
      4170549.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      525282.0
      525251.0
      435636.0
      0.9975
      5.354284
      5.355660
      7.019255
    
    
      1
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CCTTCG
      0.455550
      0.177631
      0.583859
      0.175371
      733064.0
      285841.0
      1609185.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      221972.0
      221962.0
      186757.0
      0.9975
      5.587294
      5.588449
      7.302612
    
    
      2
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CGGTAG
      0.515269
      0.177645
      0.618578
      0.174221
      1452802.0
      500870.0
      2819500.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      355730.0
      355713.0
      295624.0
      0.9975
      5.393199
      5.394331
      7.079288
    
    
      3
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTATTG
      0.556175
      0.176367
      0.652727
      0.172273
      2279354.0
      722800.0
      4098270.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      483179.0
      483150.0
      397812.0
      0.9975
      5.287116
      5.288477
      6.979525
    
    
      4
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTCAGC
      0.528642
      0.181331
      0.640401
      0.172287
      1394208.0
      478231.0
      2637340.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      356122.0
      356100.0
      294065.0
      0.9975
      5.314302
      5.315719
      6.995052



In [ ]:



In [ ]:



In [16]:

    
from itertools import combinations  # kept in this cell so it runs on its own; ideally lives with the top-of-notebook imports

# Pairwise regression for protocol 'trito_pool_1': every observation is an unordered
# pair of files; covariates are the pair means, the response is |PDR_total difference|.
tritopool = merged[merged["protocol"] == 'trito_pool_1']       # select only "trito_pool_1" files
print(len(tritopool))
tritopool = tritopool.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # columns not used below
tritopoolA = tritopool.set_index("filename")

cc = list(combinations(tritopool.filename, 2)) # all unordered (file_a, file_b) pairs
# numeric_only=True: the string column "type" is still present; older pandas skipped it
# silently, pandas 2.x raises TypeError when asked to average it.
out = pd.DataFrame([tritopoolA.loc[list(c)].mean(numeric_only=True) for c in cc], index=cc)  # covariates between pairs == mean

# |PDR_i - PDR_j| as a filename x filename matrix. The ufunc runs on plain ndarrays
# because ufunc.outer on a Series raises NotImplementedError in pandas 1.0+.
pdr_values = tritopool.PDR_total.values
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pdr_values, pdr_values)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Both tables are keyed by the (file_a, file_b) tuple; the inner merge keeps only the
# pairs listed in cc.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1 = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs1.shape)

pairs1 = pairs1.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs1.PDR_difference # dependent variable to predict

# Covariates selected explicitly (same three columns the original drop-list left behind).
X = pairs1[["total_cpg_no_filter", "bsRate_mean", "avgReadCpG_mean"]]

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# Backslash escaped: "\d" is an invalid escape sequence; the printed text is unchanged.
print("Regression results for CLL 'RRBS_trito_pool1', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



21
(210, 14)
(210,)
(210, 3)
Regression results for CLL 'RRBS_trito_pool1', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[16]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.016


  Model:                    OLS          Adj. R-squared:        0.001


  Method:              Least Squares     F-statistic:           1.084


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):    0.357 


  Time:                  22:02:23        Log-Likelihood:       826.59


  No. Observations:          210         AIC:                  -1645.


  Df Residuals:              206         BIC:                  -1632.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    0.0685      4.560      0.015   0.988     -8.921     9.058


  total_cpg_no_filter  -4.835e-09    5.1e-09     -0.949   0.344  -1.49e-08  5.21e-09


  bsRate_mean             -0.0338      4.652     -0.007   0.994     -9.206     9.138


  avgReadCpG_mean         -0.0047      0.012     -0.380   0.705     -0.029     0.020




  Omnibus:        14.277    Durbin-Watson:         1.921


  Prob(Omnibus):   0.001    Jarque-Bera (JB):     14.938


  Skew:            0.618    Prob(JB):           0.000570


  Kurtosis:        2.575    Cond. No.           1.46e+10



In [17]:

    
# Joint plot with a regression fit: pair-mean bsRate against the absolute PDR difference.
caption = "PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1"
print(caption)
sns.jointplot(data=pairs1, x="bsRate_mean", y="PDR_difference", kind="reg")
plt.title(caption)









    



PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[17]:





<matplotlib.text.Text at 0x115eb0ef0>



In [18]:

    
# Joint plot with a regression fit: pair-mean avgReadCpG against the absolute PDR difference.
caption = "PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1"
print(caption)
sns.jointplot(data=pairs1, x="avgReadCpG_mean", y="PDR_difference", kind="reg")
plt.title(caption)









    



PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[18]:





<matplotlib.text.Text at 0x116a21518>



In [19]:

    
# Joint plot with a regression fit: total_cpg_no_filter against the absolute PDR difference.
# The title previously said "avgReadCpGs_mean" (copied from the preceding cell) although
# the x-axis is total_cpg_no_filter; it now matches the printed caption.
# NOTE(review): plt.title targets the current axes, which after jointplot may be a
# marginal axes rather than the main panel -- confirm the rendered placement.
caption = "PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_1"
print(caption)
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs1, kind="reg")
plt.title(caption)









    



PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_1






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[19]:





<matplotlib.text.Text at 0x116c7e128>



In [20]:

    
from itertools import combinations  # kept in this cell so it runs on its own; ideally lives with the top-of-notebook imports

# Pairwise regression for protocol 'trito_pool_2': every observation is an unordered
# pair of files; covariates are the pair means, the response is |PDR_total difference|.
tritopool = merged[merged["protocol"] == 'trito_pool_2']       # select only "trito_pool_2" files
print(len(tritopool))
tritopool = tritopool.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # columns not used below
tritopoolA = tritopool.set_index("filename")

cc = list(combinations(tritopool.filename, 2)) # all unordered (file_a, file_b) pairs
# numeric_only=True: the string column "type" is still present; older pandas skipped it
# silently, pandas 2.x raises TypeError when asked to average it.
out = pd.DataFrame([tritopoolA.loc[list(c)].mean(numeric_only=True) for c in cc], index=cc)  # covariates between pairs == mean

# |PDR_i - PDR_j| as a filename x filename matrix. The ufunc runs on plain ndarrays
# because ufunc.outer on a Series raises NotImplementedError in pandas 1.0+.
pdr_values = tritopool.PDR_total.values
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pdr_values, pdr_values)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Both tables are keyed by the (file_a, file_b) tuple; the inner merge keeps only the
# pairs listed in cc.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1a = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs1a.shape)
pairs1a = pairs1a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs1a.PDR_difference # dependent variable to predict

# Covariates selected explicitly (same three columns the original drop-list left behind).
X = pairs1a[["total_cpg_no_filter", "bsRate_mean", "avgReadCpG_mean"]]

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# Label corrected: this cell analyses pool 2, but the message was copied from the pool 1 cell.
# Backslash escaped: "\d" is an invalid escape sequence.
print("Regression results for CLL 'RRBS_trito_pool2', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



21
(210, 14)
(210,)
(210, 3)
Regression results for CLL 'RRBS_trito_pool1', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[20]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.046


  Model:                    OLS          Adj. R-squared:        0.032


  Method:              Least Squares     F-statistic:           3.277


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):   0.0220 


  Time:                  22:02:27        Log-Likelihood:       801.57


  No. Observations:          210         AIC:                  -1595.


  Df Residuals:              206         BIC:                  -1582.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    3.2772      7.294      0.449   0.654    -11.102    17.657


  total_cpg_no_filter   1.346e-08   4.86e-09      2.772   0.006   3.89e-09   2.3e-08


  bsRate_mean             -3.3844      7.550     -0.448   0.654    -18.270    11.501


  avgReadCpG_mean          0.0043      0.017      0.256   0.798     -0.029     0.037




  Omnibus:        12.553    Durbin-Watson:         2.493


  Prob(Omnibus):   0.002    Jarque-Bera (JB):     10.855


  Skew:            0.478    Prob(JB):            0.00439


  Kurtosis:        2.428    Cond. No.           2.05e+10



In [21]:

    
# Joint plot with a regression fit: pair-mean bsRate against the absolute PDR difference.
caption = "PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2"
print(caption)
sns.jointplot(data=pairs1a, x="bsRate_mean", y="PDR_difference", kind="reg")
plt.title(caption)









    



PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[21]:





<matplotlib.text.Text at 0x116fbaa20>



In [22]:

    
# Joint plot with a regression fit: pair-mean avgReadCpG against the absolute PDR difference.
# The printed caption and the figure title are separate strings in this cell.
print("PDR_difference vs. mean avgReadCpGs, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(
    data=pairs1a,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")









    



PDR_difference vs. mean avgReadCpGs, jointplot, RRBS_trito_pool CLL trito_2






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[22]:





<matplotlib.text.Text at 0x1171f8748>



In [23]:

    
# Joint plot with a regression fit: total_cpg_no_filter against the absolute PDR difference.
# The title previously said "avgReadCpGs_mean" (copied from the preceding cell) although
# the x-axis is total_cpg_no_filter; it now matches the printed caption.
# NOTE(review): plt.title targets the current axes, which after jointplot may be a
# marginal axes rather than the main panel -- confirm the rendered placement.
caption = "PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_2"
print(caption)
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs1a, kind="reg")
plt.title(caption)









    



PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_2






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[23]:





<matplotlib.text.Text at 0x11750e240>



In [ ]:



In [24]:

    
from itertools import combinations  # kept in this cell so it runs on its own; ideally lives with the top-of-notebook imports

# Pairwise regression for protocol 'cw154_Tris_protease': every observation is an unordered
# pair of files; covariates are the pair means, the response is |PDR_total difference|.
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # columns not used below
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")

cc = list(combinations(cw154.filename, 2)) # all unordered (file_a, file_b) pairs
# numeric_only=True: the string column "type" is still present; older pandas skipped it
# silently, pandas 2.x raises TypeError when asked to average it.
out = pd.DataFrame([cw154A.loc[list(c)].mean(numeric_only=True) for c in cc], index=cc)  # covariates between pairs == mean

# |PDR_i - PDR_j| as a filename x filename matrix. The ufunc runs on plain ndarrays
# because ufunc.outer on a Series raises NotImplementedError in pandas 1.0+.
pdr_values = cw154.PDR_total.values
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pdr_values, pdr_values)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Both tables are keyed by the (file_a, file_b) tuple; the inner merge keeps only the
# pairs listed in cc.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2 = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs2.shape)
pairs2 = pairs2.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs2.PDR_difference # dependent variable to predict

# Covariates selected explicitly (same three columns the original drop-list left behind).
X = pairs2[["total_cpg_no_filter", "bsRate_mean", "avgReadCpG_mean"]]

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# Backslash escaped: "\d" is an invalid escape sequence; the printed text is unchanged.
print("Regression results for CLL 'cw154_Tris_protease', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_Tris_protease', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[24]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.032


  Model:                    OLS          Adj. R-squared:        0.017


  Method:              Least Squares     F-statistic:           2.065


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):    0.106 


  Time:                  22:02:30        Log-Likelihood:       719.90


  No. Observations:          190         AIC:                  -1432.


  Df Residuals:              186         BIC:                  -1419.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    2.3266      2.490      0.935   0.351     -2.585     7.238


  total_cpg_no_filter  -7.732e-09   4.12e-09     -1.877   0.062  -1.59e-08  3.93e-10


  bsRate_mean             -2.4557      2.603     -0.943   0.347     -7.591     2.680


  avgReadCpG_mean          0.0082      0.006      1.272   0.205     -0.005     0.021




  Omnibus:        15.617    Durbin-Watson:         2.183


  Prob(Omnibus):   0.000    Jarque-Bera (JB):      8.586


  Skew:            0.346    Prob(JB):             0.0137


  Kurtosis:        2.222    Cond. No.           3.35e+09



In [25]:

    
# Joint plot with a regression fit: pair-mean bsRate against the absolute PDR difference.
# The printed caption and the figure title are separate strings in this cell.
print("PDR_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease")
sns.jointplot(
    data=pairs2,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. bsRate, jointplot, cw154  CLL cw154_Tris_protease")









    



PDR_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[25]:





<matplotlib.text.Text at 0x115deee80>



In [26]:

    
# Joint plot with a regression fit: pair-mean avgReadCpG against the absolute PDR difference.
# The printed caption and the figure title are separate strings in this cell.
print("PDR_difference vs. avgReadCpGs, jointplot, CLL cw154_Tris_protease")
sns.jointplot(
    data=pairs2,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_Tris_protease")









    



PDR_difference vs. avgReadCpGs, jointplot, CLL cw154_Tris_protease






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[26]:





<matplotlib.text.Text at 0x117a34550>



In [27]:

    
# Joint plot with a regression fit: total_cpg_no_filter against the absolute PDR difference.
# The title previously said "avgReadCpGs_mean" (copied from the preceding cell) although
# the x-axis is total_cpg_no_filter; it now matches the printed caption.
# NOTE(review): plt.title targets the current axes, which after jointplot may be a
# marginal axes rather than the main panel -- confirm the rendered placement.
caption = "PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease"
print(caption)
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs2, kind="reg")
plt.title(caption)









    



PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[27]:





<matplotlib.text.Text at 0x117c6df28>



In [28]:

    
from itertools import combinations  # kept in this cell so it runs on its own; ideally lives with the top-of-notebook imports

# Pairwise regression for protocol 'cw154_Tris_protease_GR': every observation is an unordered
# pair of files; covariates are the pair means, the response is |PDR_total difference|.
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease_GR']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # columns not used below
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")

cc = list(combinations(cw154.filename, 2)) # all unordered (file_a, file_b) pairs
# numeric_only=True: the string column "type" is still present; older pandas skipped it
# silently, pandas 2.x raises TypeError when asked to average it.
out = pd.DataFrame([cw154A.loc[list(c)].mean(numeric_only=True) for c in cc], index=cc)  # covariates between pairs == mean

# |PDR_i - PDR_j| as a filename x filename matrix. The ufunc runs on plain ndarrays
# because ufunc.outer on a Series raises NotImplementedError in pandas 1.0+.
pdr_values = cw154.PDR_total.values
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pdr_values, pdr_values)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Both tables are keyed by the (file_a, file_b) tuple; the inner merge keeps only the
# pairs listed in cc.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2a = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs2a.shape)
pairs2a = pairs2a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs2a.PDR_difference # dependent variable to predict

# Covariates selected explicitly (same three columns the original drop-list left behind).
X = pairs2a[["total_cpg_no_filter", "bsRate_mean", "avgReadCpG_mean"]]

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# Backslash escaped: "\d" is an invalid escape sequence; the printed text is unchanged.
print("Regression results for CLL 'cw154_Tris_protease_GR', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_Tris_protease_GR', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[28]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.062


  Model:                    OLS          Adj. R-squared:        0.047


  Method:              Least Squares     F-statistic:           4.078


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):   0.00781


  Time:                  22:02:33        Log-Likelihood:       702.79


  No. Observations:          190         AIC:                  -1398.


  Df Residuals:              186         BIC:                  -1385.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   -9.5428      3.102     -3.076   0.002    -15.663    -3.423


  total_cpg_no_filter   3.477e-10   3.87e-09      0.090   0.929  -7.29e-09  7.98e-09


  bsRate_mean              9.9624      3.248      3.067   0.002      3.554    16.371


  avgReadCpG_mean         -0.0040      0.006     -0.691   0.490     -0.015     0.007




  Omnibus:         9.974    Durbin-Watson:         1.844


  Prob(Omnibus):   0.007    Jarque-Bera (JB):      8.038


  Skew:            0.408    Prob(JB):             0.0180


  Kurtosis:        2.410    Cond. No.           4.71e+09



In [29]:

    
# Joint plot with a regression fit: pair-mean bsRate against the absolute PDR difference.
# The printed caption and the figure title are separate strings in this cell.
print("PDR_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease_GR")
sns.jointplot(
    data=pairs2a,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. bsRate, jointplot, cw154  CLL cw154_Tris_protease_GR")









    



PDR_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease_GR






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[29]:





<matplotlib.text.Text at 0x117eff048>



In [30]:

    
# Joint plot with a regression fit: pair-mean avgReadCpG against the absolute PDR difference.
# The printed caption and the figure title are separate strings in this cell.
print("PDR_difference vs. avgReadCpgs, jointplot, CLL cw154_Tris_protease_GR")
sns.jointplot(
    data=pairs2a,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_Tris_protease_GR")









    



PDR_difference vs. avgReadCpgs, jointplot, CLL cw154_Tris_protease_GR






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[30]:





<matplotlib.text.Text at 0x1180f4a90>



In [31]:

    
# Joint plot with a regression fit: total_cpg_no_filter against the absolute PDR difference.
# The title previously said "avgReadCpGs_mean" (copied from the preceding cell) although
# the x-axis is total_cpg_no_filter; it now matches the printed caption.
# NOTE(review): plt.title targets the current axes, which after jointplot may be a
# marginal axes rather than the main panel -- confirm the rendered placement.
caption = "PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease_GR"
print(caption)
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs2a, kind="reg")
plt.title(caption)









    



PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease_GR






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[31]:





<matplotlib.text.Text at 0x1183ff0b8>



In [ ]:



In [ ]:



In [32]:

    
from itertools import combinations  # kept in this cell so it runs on its own; ideally lives with the top-of-notebook imports

# Pairwise regression for protocol 'cw154_CutSmart_proteinase_K': every observation is an
# unordered pair of files; covariates are the pair means, the response is |PDR_total difference|.
cw154 = merged[merged["protocol"] == 'cw154_CutSmart_proteinase_K']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # columns not used below
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")

cc = list(combinations(cw154.filename, 2)) # all unordered (file_a, file_b) pairs
# numeric_only=True: the string column "type" is still present; older pandas skipped it
# silently, pandas 2.x raises TypeError when asked to average it.
out = pd.DataFrame([cw154A.loc[list(c)].mean(numeric_only=True) for c in cc], index=cc)  # covariates between pairs == mean

# |PDR_i - PDR_j| as a filename x filename matrix. The ufunc runs on plain ndarrays
# because ufunc.outer on a Series raises NotImplementedError in pandas 1.0+.
pdr_values = cw154.PDR_total.values
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pdr_values, pdr_values)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Both tables are keyed by the (file_a, file_b) tuple; the inner merge keeps only the
# pairs listed in cc.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2b = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs2b.shape)
pairs2b = pairs2b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs2b.PDR_difference # dependent variable to predict

# Covariates selected explicitly (same three columns the original drop-list left behind).
X = pairs2b[["total_cpg_no_filter", "bsRate_mean", "avgReadCpG_mean"]]

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# Backslash escaped: "\d" is an invalid escape sequence; the printed text is unchanged.
print("Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[32]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.086


  Model:                    OLS          Adj. R-squared:        0.071


  Method:              Least Squares     F-statistic:           5.833


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  0.000789


  Time:                  22:02:37        Log-Likelihood:       650.81


  No. Observations:          190         AIC:                  -1294.


  Df Residuals:              186         BIC:                  -1281.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   -3.2853      4.886     -0.672   0.502    -12.925     6.354


  total_cpg_no_filter  -2.349e-08   6.14e-09     -3.825   0.000  -3.56e-08 -1.14e-08


  bsRate_mean              3.3305      5.075      0.656   0.513     -6.682    13.343


  avgReadCpG_mean          0.0185      0.017      1.122   0.264     -0.014     0.051




  Omnibus:        10.567    Durbin-Watson:         1.836


  Prob(Omnibus):   0.005    Jarque-Bera (JB):     11.400


  Skew:            0.595    Prob(JB):            0.00335


  Kurtosis:        2.848    Cond. No.           6.97e+09



In [33]:

    
# Joint plot with a regression fit: pair-mean bsRate against the absolute PDR difference.
# The printed caption and the figure title are separate strings in this cell.
print("PDR_difference vs. bsRate, jointplot,  CLL cw154_CutSmart_proteinase_K")
sns.jointplot(
    data=pairs2b,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. bsRate, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")









    



PDR_difference vs. bsRate, jointplot,  CLL cw154_CutSmart_proteinase_K






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[33]:





<matplotlib.text.Text at 0x118695dd8>



In [34]:

    
# Joint plot with a regression fit: pair-mean avgReadCpG against the absolute PDR difference.
# The printed caption and the figure title are separate strings in this cell.
print("PDR_difference vs. avgReadCpG_mean, jointplot, CLL cw154_CutSmart_proteinase_K")
sns.jointplot(
    data=pairs2b,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_CutSmart_proteinase_K")









    



PDR_difference vs. avgReadCpG_mean, jointplot, CLL cw154_CutSmart_proteinase_K






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[34]:





<matplotlib.text.Text at 0x1188a8b00>



In [35]:

    
# Joint plot with a regression fit: total_cpg_no_filter against the absolute PDR difference.
# The title previously said "avgReadCpGs_mean" (copied from the preceding cell) although
# the x-axis is total_cpg_no_filter; it now matches the printed caption.
# NOTE(review): plt.title targets the current axes, which after jointplot may be a
# marginal axes rather than the main panel -- confirm the rendered placement.
caption = "PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_CutSmart_proteinase_K"
print(caption)
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs2b, kind="reg")
plt.title(caption)









    



PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_CutSmart_proteinase_K






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[35]:





<matplotlib.text.Text at 0x118add828>



In [ ]:



In [36]:

    
from itertools import combinations  # kept in this cell so it runs on its own; ideally lives with the top-of-notebook imports

# Pairwise regression for protocol 'NormalBCD19pCD27pcell1_22_': every observation is an
# unordered pair of files; covariates are the pair means, the response is |PDR_total difference|.
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell1_22_']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # columns not used below
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")

cc = list(combinations(pcell.filename, 2)) # all unordered (file_a, file_b) pairs
# numeric_only=True: the string column "type" is still present; older pandas skipped it
# silently, pandas 2.x raises TypeError when asked to average it.
out = pd.DataFrame([pcellA.loc[list(c)].mean(numeric_only=True) for c in cc], index=cc)  # covariates between pairs == mean

# |PDR_i - PDR_j| as a filename x filename matrix. The ufunc runs on plain ndarrays
# because ufunc.outer on a Series raises NotImplementedError in pandas 1.0+.
pdr_values = pcell.PDR_total.values
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pdr_values, pdr_values)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Both tables are keyed by the (file_a, file_b) tuple; the inner merge keeps only the
# pairs listed in cc.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3 = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs3.shape)
pairs3 = pairs3.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs3.PDR_difference # dependent variable to predict

# Covariates selected explicitly (same three columns the original drop-list left behind).
X = pairs3[["total_cpg_no_filter", "bsRate_mean", "avgReadCpG_mean"]]

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# Backslash escaped: "\d" is an invalid escape sequence; the printed text is unchanged.
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



17
(136, 14)
(136,)
(136, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[36]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.041


  Model:                    OLS          Adj. R-squared:        0.019


  Method:              Least Squares     F-statistic:           1.860


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):    0.139 


  Time:                  22:02:41        Log-Likelihood:       248.08


  No. Observations:          136         AIC:                  -488.2


  Df Residuals:              132         BIC:                  -476.5


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   -3.3352     59.064     -0.056   0.955   -120.169   113.498


  total_cpg_no_filter  -1.239e-07   7.45e-08     -1.664   0.099  -2.71e-07  2.34e-08


  bsRate_mean              4.5060     59.116      0.076   0.939   -112.432   121.444


  avgReadCpG_mean         -0.2014      0.088     -2.281   0.024     -0.376    -0.027




  Omnibus:         8.744    Durbin-Watson:         1.824


  Prob(Omnibus):   0.013    Jarque-Bera (JB):      9.347


  Skew:            0.634    Prob(JB):            0.00934


  Kurtosis:        2.799    Cond. No.           7.73e+09



In [37]:

    
# Joint regression plot of the pairwise PDR difference against bsRate_mean in pairs3.
plot_label = "PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_"

print(plot_label)
sns.jointplot(
    data=pairs3,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title(plot_label)









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[37]:





<matplotlib.text.Text at 0x118d2afd0>



In [38]:

    
# Joint regression plot of the pairwise PDR difference against avgReadCpG_mean in pairs3.
printed_label = "PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_"
figure_title = "PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell1_22_"

print(printed_label)
sns.jointplot(
    data=pairs3,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title(figure_title)









    



PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[38]:





<matplotlib.text.Text at 0x118f9f860>



In [39]:

    
# Joint regression plot of the pairwise PDR difference against the total number of
# unique CpGs (total_cpg_no_filter) in pairs3.
plot_label = "PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_"
print(plot_label)
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs3, kind="reg")
# The title must name the covariate actually plotted on x (total unique CpGs),
# not avgReadCpGs_mean, so it reuses the printed label.
plt.title(plot_label)









    



PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[39]:





<matplotlib.text.Text at 0x1191d2668>



In [ ]:



In [40]:

    
# Pairwise OLS for one protocol group: regress the absolute PDR_total difference of
# every pair of cells on the pair-averaged covariates.
from itertools import combinations  # imported here so the cell runs on its own

# Single source of truth for the group name; the printed heading below reuses it so
# the report cannot name a different group than the one that was filtered.
protocol_name = 'NormalBCD19pCD27pcell23_44'
pcell = merged[merged["protocol"] == protocol_name]
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")

# One row per unordered pair of cells, holding the mean of each numeric column.
# list(c) selects the two rows by label; numeric_only=True makes the exclusion of
# non-numeric columns explicit instead of relying on pandas dropping them silently.
cc = list(combinations(pcell.filename, 2))
out = pd.DataFrame([pcellA.loc[list(c), :].mean(numeric_only=True) for c in cc], index=cc)

# Full n x n matrix of |PDR_total_i - PDR_total_j|, computed on the underlying
# NumPy arrays (ufunc.outer is not supported on pandas Series in newer pandas).
pdr_values = pcell.PDR_total.values
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pdr_values, pdr_values)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Attach the differences to the pair means; the (file_a, file_b) tuple is the key,
# so only the pairs produced by combinations() survive the inner merge.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3a = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs3a.shape)
pairs3a = pairs3a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs3a.PDR_difference # dependent variable to predict

# Keep only the covariates used in the model (everything not dropped here).
X = pairs3a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# "\\delta" prints a literal backslash; a bare "\d" is an invalid escape sequence.
print("Regression results for Normal '" + protocol_name + "', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



22
(231, 14)
(231,)
(231, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[40]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.093


  Model:                    OLS          Adj. R-squared:        0.081


  Method:              Least Squares     F-statistic:           7.802


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  5.58e-05


  Time:                  22:02:45        Log-Likelihood:       521.68


  No. Observations:          231         AIC:                  -1035.


  Df Residuals:              227         BIC:                  -1022.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -63.1129     42.261     -1.493   0.137   -146.388    20.162


  total_cpg_no_filter   8.747e-08   3.32e-08      2.634   0.009    2.2e-08  1.53e-07


  bsRate_mean             62.7111     42.342      1.481   0.140    -20.723   146.146


  avgReadCpG_mean          0.1050      0.022      4.711   0.000      0.061     0.149




  Omnibus:        40.060    Durbin-Watson:         1.231


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     55.500


  Skew:            1.096    Prob(JB):           8.88e-13


  Kurtosis:        3.980    Cond. No.           1.03e+10



In [41]:

    
# Joint regression plot of the pairwise PDR difference against bsRate_mean in pairs3a.
plot_label = "PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44"

print(plot_label)
sns.jointplot(
    data=pairs3a,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title(plot_label)









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[41]:





<matplotlib.text.Text at 0x1195fae80>



In [42]:

    
# Joint regression plot of the pairwise PDR difference against avgReadCpG_mean in pairs3a.
printed_label = "PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44"
figure_title = "PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell23_44"

print(printed_label)
sns.jointplot(
    data=pairs3a,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title(figure_title)









    



PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[42]:





<matplotlib.text.Text at 0x119845a90>



In [43]:

    
# Joint regression plot of the pairwise PDR difference against the total number of
# unique CpGs (total_cpg_no_filter) in pairs3a.
plot_label = "PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell23_44"
print(plot_label)
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs3a, kind="reg")
# The title must name the covariate actually plotted on x (total unique CpGs),
# not avgReadCpGs_mean, so it reuses the printed label.
plt.title(plot_label)









    



PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[43]:





<matplotlib.text.Text at 0x119b4e940>



In [ ]:



In [44]:

    
# Pairwise OLS for one protocol group: regress the absolute PDR_total difference of
# every pair of cells on the pair-averaged covariates.
from itertools import combinations  # imported here so the cell runs on its own

# Single source of truth for the group name; the printed heading below reuses it so
# the report cannot name a different group than the one that was filtered.
protocol_name = 'NormalBCD19pCD27pcell45_66'
pcell = merged[merged["protocol"] == protocol_name]
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")

# One row per unordered pair of cells, holding the mean of each numeric column.
# list(c) selects the two rows by label; numeric_only=True makes the exclusion of
# non-numeric columns explicit instead of relying on pandas dropping them silently.
cc = list(combinations(pcell.filename, 2))
out = pd.DataFrame([pcellA.loc[list(c), :].mean(numeric_only=True) for c in cc], index=cc)

# Full n x n matrix of |PDR_total_i - PDR_total_j|, computed on the underlying
# NumPy arrays (ufunc.outer is not supported on pandas Series in newer pandas).
pdr_values = pcell.PDR_total.values
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pdr_values, pdr_values)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Attach the differences to the pair means; the (file_a, file_b) tuple is the key,
# so only the pairs produced by combinations() survive the inner merge.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3b = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs3b.shape)
pairs3b = pairs3b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs3b.PDR_difference # dependent variable to predict

# Keep only the covariates used in the model (everything not dropped here).
X = pairs3b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# "\\delta" prints a literal backslash; a bare "\d" is an invalid escape sequence.
print("Regression results for Normal '" + protocol_name + "', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



9
(36, 14)
(36,)
(36, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[44]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.143


  Model:                    OLS          Adj. R-squared:        0.062


  Method:              Least Squares     F-statistic:           1.776


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):    0.172 


  Time:                  22:02:48        Log-Likelihood:       76.815


  No. Observations:           36         AIC:                  -145.6


  Df Residuals:               32         BIC:                  -139.3


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -132.3807    248.763     -0.532   0.598   -639.093   374.332


  total_cpg_no_filter  -2.629e-08   3.61e-08     -0.728   0.472  -9.98e-08  4.73e-08


  bsRate_mean            132.1700    248.978      0.531   0.599   -374.982   639.322


  avgReadCpG_mean          0.1103      0.091      1.213   0.234     -0.075     0.295




  Omnibus:         1.751    Durbin-Watson:         1.629


  Prob(Omnibus):   0.417    Jarque-Bera (JB):      1.636


  Skew:            0.476    Prob(JB):              0.441


  Kurtosis:        2.570    Cond. No.           3.28e+10



In [45]:

    
# Joint regression plot of the pairwise PDR difference against bsRate_mean in pairs3b.
plot_label = "PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66"

print(plot_label)
sns.jointplot(
    data=pairs3b,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title(plot_label)









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[45]:





<matplotlib.text.Text at 0x119dabda0>



In [46]:

    
# Joint regression plot of the pairwise PDR difference against avgReadCpG_mean in pairs3b.
printed_label = "PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66"
figure_title = "PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell45_66"

print(printed_label)
sns.jointplot(
    data=pairs3b,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title(figure_title)









    



PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[46]:





<matplotlib.text.Text at 0x119fd35c0>



In [47]:

    
# Joint regression plot of the pairwise PDR difference against the total number of
# unique CpGs (total_cpg_no_filter) in pairs3b.
plot_label = "PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell45_66"
print(plot_label)
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs3b, kind="reg")
# The title must name the covariate actually plotted on x (total unique CpGs),
# not avgReadCpGs_mean, so it reuses the printed label.
plt.title(plot_label)









    



PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[47]:





<matplotlib.text.Text at 0x11a270cc0>



In [ ]:



In [ ]:



In [48]:

    
# Pairwise OLS for one protocol group: regress the absolute PDR_total difference of
# every pair of cells on the pair-averaged covariates.
from itertools import combinations  # imported here so the cell runs on its own

# Single source of truth for the group name; the printed heading below reuses it so
# the report cannot name a different group than the one that was filtered.
protocol_name = 'NormalBCD19pCD27pcell67_88'
pcell = merged[merged["protocol"] == protocol_name]
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")

# One row per unordered pair of cells, holding the mean of each numeric column.
# list(c) selects the two rows by label; numeric_only=True makes the exclusion of
# non-numeric columns explicit instead of relying on pandas dropping them silently.
cc = list(combinations(pcell.filename, 2))
out = pd.DataFrame([pcellA.loc[list(c), :].mean(numeric_only=True) for c in cc], index=cc)

# Full n x n matrix of |PDR_total_i - PDR_total_j|, computed on the underlying
# NumPy arrays (ufunc.outer is not supported on pandas Series in newer pandas).
pdr_values = pcell.PDR_total.values
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pdr_values, pdr_values)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Attach the differences to the pair means; the (file_a, file_b) tuple is the key,
# so only the pairs produced by combinations() survive the inner merge.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3c = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs3c.shape)
pairs3c = pairs3c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs3c.PDR_difference # dependent variable to predict

# Keep only the covariates used in the model (everything not dropped here).
X = pairs3c.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# "\\delta" prints a literal backslash; a bare "\d" is an invalid escape sequence.
print("Regression results for Normal '" + protocol_name + "', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[48]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.006


  Model:                    OLS          Adj. R-squared:       -0.010


  Method:              Least Squares     F-statistic:          0.3677


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):    0.776 


  Time:                  22:02:51        Log-Likelihood:       396.97


  No. Observations:          190         AIC:                  -785.9


  Df Residuals:              186         BIC:                  -772.9


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   -4.1828     47.709     -0.088   0.930    -98.302    89.937


  total_cpg_no_filter   3.348e-08   3.77e-08      0.888   0.376  -4.09e-08  1.08e-07


  bsRate_mean              4.2617     47.701      0.089   0.929    -89.842    98.365


  avgReadCpG_mean         -0.0064      0.060     -0.106   0.916     -0.125     0.112




  Omnibus:        12.423    Durbin-Watson:         1.565


  Prob(Omnibus):   0.002    Jarque-Bera (JB):     13.696


  Skew:            0.650    Prob(JB):            0.00106


  Kurtosis:        2.801    Cond. No.           8.67e+09



In [49]:

    
# Joint regression plot of the pairwise PDR difference against bsRate_mean in pairs3c.
plot_label = "PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88"

print(plot_label)
sns.jointplot(
    data=pairs3c,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title(plot_label)









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[49]:





<matplotlib.text.Text at 0x11a4fb470>



In [50]:

    
# Joint regression plot of the pairwise PDR difference against avgReadCpG_mean in pairs3c.
printed_label = "PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27pcell67_88"
figure_title = "PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell67_88"

print(printed_label)
sns.jointplot(
    data=pairs3c,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title(figure_title)









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27pcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[50]:





<matplotlib.text.Text at 0x11a72c5c0>



In [51]:

    
# Joint regression plot of the pairwise PDR difference against the total number of
# unique CpGs (total_cpg_no_filter) in pairs3c.
plot_label = "PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27pcell67_88"
print(plot_label)
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs3c, kind="reg")
# The title must name the covariate actually plotted on x (total unique CpGs),
# not avgReadCpGs_mean, so it reuses the printed label.
plt.title(plot_label)









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27pcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[51]:





<matplotlib.text.Text at 0x11aa5df98>



In [ ]:



In [ ]:



In [52]:

    
# Pairwise OLS for one protocol group: regress the absolute PDR_total difference of
# every pair of cells on the pair-averaged covariates.
from itertools import combinations  # imported here so the cell runs on its own

# Single source of truth for the group name; the printed heading below reuses it.
protocol_name = 'NormalBCD19pCD27mcell1_22_'
mcell = merged[merged["protocol"] == protocol_name]
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")

# One row per unordered pair of cells, holding the mean of each numeric column.
# list(c) selects the two rows by label; numeric_only=True makes the exclusion of
# non-numeric columns explicit instead of relying on pandas dropping them silently.
cc = list(combinations(mcell.filename, 2))
out = pd.DataFrame([mcellA.loc[list(c), :].mean(numeric_only=True) for c in cc], index=cc)

# Full n x n matrix of |PDR_total_i - PDR_total_j|, computed on the underlying
# NumPy arrays (ufunc.outer is not supported on pandas Series in newer pandas).
pdr_values = mcell.PDR_total.values
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pdr_values, pdr_values)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Attach the differences to the pair means; the (file_a, file_b) tuple is the key,
# so only the pairs produced by combinations() survive the inner merge.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4 = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs4.shape)
pairs4 = pairs4.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs4.PDR_difference # dependent variable to predict

# Keep only the covariates used in the model (everything not dropped here).
X = pairs4.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# "\\delta" prints a literal backslash; a bare "\d" is an invalid escape sequence.
print("Regression results for Normal '" + protocol_name + "', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



18
(153, 14)
(153,)
(153, 3)
Regression results for Normal 'NormalBCD19pCD27mcell1_22_', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[52]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.249


  Model:                    OLS          Adj. R-squared:        0.234


  Method:              Least Squares     F-statistic:           16.50


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  2.61e-09


  Time:                  22:02:54        Log-Likelihood:       435.43


  No. Observations:          153         AIC:                  -862.9


  Df Residuals:              149         BIC:                  -850.7


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  220.3880     35.694      6.174   0.000    149.855   290.921


  total_cpg_no_filter  -5.719e-08   2.24e-08     -2.549   0.012  -1.02e-07 -1.29e-08


  bsRate_mean           -221.2889     35.848     -6.173   0.000   -292.125  -150.453


  avgReadCpG_mean          0.0619      0.024      2.571   0.011      0.014     0.109




  Omnibus:        51.119    Durbin-Watson:         1.597


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     95.901


  Skew:            1.592    Prob(JB):           1.50e-21


  Kurtosis:        5.216    Cond. No.           1.42e+10



In [53]:

    
# Joint regression plot of the pairwise PDR difference against bsRate_mean in pairs4.
plot_label = "PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_"

print(plot_label)
sns.jointplot(
    data=pairs4,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title(plot_label)









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[53]:





<matplotlib.text.Text at 0x11aca99b0>



In [54]:

    
# Joint regression plot of the pairwise PDR difference against avgReadCpG_mean in pairs4.
printed_label = "PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_"
figure_title = "PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell1_22_"

print(printed_label)
sns.jointplot(
    data=pairs4,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title(figure_title)









    



PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[54]:





<matplotlib.text.Text at 0x11af459e8>



In [55]:

    
# Joint regression plot of the pairwise PDR difference against the total number of
# unique CpGs (total_cpg_no_filter) in pairs4.
plot_label = "PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_"
print(plot_label)
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs4, kind="reg")
# The title must name the covariate actually plotted on x (total unique CpGs),
# not avgReadCpGs_mean, so it reuses the printed label.
plt.title(plot_label)









    



PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[55]:





<matplotlib.text.Text at 0x11b304e10>



In [ ]:



In [56]:

    
# Pairwise OLS for one protocol group: regress the absolute PDR_total difference of
# every pair of cells on the pair-averaged covariates.
from itertools import combinations  # imported here so the cell runs on its own

# Single source of truth for the group name; the printed heading below reuses it.
protocol_name = 'NormalBCD19pCD27mcell23_44'
mcell = merged[merged["protocol"] == protocol_name]
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")

# One row per unordered pair of cells, holding the mean of each numeric column.
# list(c) selects the two rows by label; numeric_only=True makes the exclusion of
# non-numeric columns explicit instead of relying on pandas dropping them silently.
cc = list(combinations(mcell.filename, 2))
out = pd.DataFrame([mcellA.loc[list(c), :].mean(numeric_only=True) for c in cc], index=cc)

# Full n x n matrix of |PDR_total_i - PDR_total_j|, computed on the underlying
# NumPy arrays (ufunc.outer is not supported on pandas Series in newer pandas).
pdr_values = mcell.PDR_total.values
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pdr_values, pdr_values)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Attach the differences to the pair means; the (file_a, file_b) tuple is the key,
# so only the pairs produced by combinations() survive the inner merge.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4a = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs4a.shape)
pairs4a = pairs4a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs4a.PDR_difference # dependent variable to predict

# Keep only the covariates used in the model (everything not dropped here).
X = pairs4a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# "\\delta" prints a literal backslash; a bare "\d" is an invalid escape sequence.
print("Regression results for Normal '" + protocol_name + "', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



18
(153, 14)
(153,)
(153, 3)
Regression results for Normal 'NormalBCD19pCD27mcell23_44', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[56]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.108


  Model:                    OLS          Adj. R-squared:        0.090


  Method:              Least Squares     F-statistic:           5.991


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  0.000698


  Time:                  22:02:57        Log-Likelihood:       383.43


  No. Observations:          153         AIC:                  -758.9


  Df Residuals:              149         BIC:                  -746.7


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -17.2786     60.573     -0.285   0.776   -136.971   102.414


  total_cpg_no_filter   1.186e-07   4.37e-08      2.715   0.007   3.23e-08  2.05e-07


  bsRate_mean             17.3791     60.728      0.286   0.775   -102.621   137.379


  avgReadCpG_mean         -0.0151      0.048     -0.313   0.755     -0.111     0.081




  Omnibus:        71.046    Durbin-Watson:         0.233


  Prob(Omnibus):   0.000    Jarque-Bera (JB):    175.322


  Skew:            2.060    Prob(JB):           8.50e-39


  Kurtosis:        6.244    Cond. No.           1.55e+10



In [57]:

    
# Joint regression plot of the pairwise PDR difference against bsRate_mean in pairs4a.
plot_label = "PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44"

print(plot_label)
sns.jointplot(
    data=pairs4a,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title(plot_label)









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[57]:





<matplotlib.text.Text at 0x11b58fbe0>



In [58]:

    
# Joint regression plot of the pairwise PDR difference against avgReadCpG_mean in pairs4a.
printed_label = "PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44"
figure_title = "PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell23_44"

print(printed_label)
sns.jointplot(
    data=pairs4a,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
plt.title(figure_title)









    



PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[58]:





<matplotlib.text.Text at 0x11b83a9b0>



In [59]:

    
# Joint regression plot of the pairwise PDR difference against the total number of
# unique CpGs (total_cpg_no_filter) in pairs4a.
plot_label = "PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell23_44"
print(plot_label)
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs4a, kind="reg")
# The title must name the covariate actually plotted on x (total unique CpGs),
# not avgReadCpGs_mean, so it reuses the printed label.
plt.title(plot_label)









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[59]:





<matplotlib.text.Text at 0x11bba2470>



In [ ]:



In [60]:

    
# Pairwise OLS for one protocol group: regress the absolute PDR_total difference of
# every pair of cells on the pair-averaged covariates.
from itertools import combinations  # imported here so the cell runs on its own

# Single source of truth for the group name; the printed heading below reuses it.
protocol_name = 'NormalBCD19pCD27mcell45_66'
mcell = merged[merged["protocol"] == protocol_name]
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")

# One row per unordered pair of cells, holding the mean of each numeric column.
# list(c) selects the two rows by label; numeric_only=True makes the exclusion of
# non-numeric columns explicit instead of relying on pandas dropping them silently.
cc = list(combinations(mcell.filename, 2))
out = pd.DataFrame([mcellA.loc[list(c), :].mean(numeric_only=True) for c in cc], index=cc)

# Full n x n matrix of |PDR_total_i - PDR_total_j|, computed on the underlying
# NumPy arrays (ufunc.outer is not supported on pandas Series in newer pandas).
pdr_values = mcell.PDR_total.values
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pdr_values, pdr_values)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)

# Attach the differences to the pair means; the (file_a, file_b) tuple is the key,
# so only the pairs produced by combinations() survive the inner merge.
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4b = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs4b.shape)
pairs4b = pairs4b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs4b.PDR_difference # dependent variable to predict

# Keep only the covariates used in the model (everything not dropped here).
X = pairs4b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# "\\delta" prints a literal backslash; a bare "\d" is an invalid escape sequence.
print("Regression results for Normal '" + protocol_name + "', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



17
(136, 14)
(136,)
(136, 3)
Regression results for Normal 'NormalBCD19pCD27mcell45_66', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[60]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.199


  Model:                    OLS          Adj. R-squared:        0.181


  Method:              Least Squares     F-statistic:           10.93


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  1.84e-06


  Time:                  22:03:01        Log-Likelihood:       587.59


  No. Observations:          136         AIC:                  -1167.


  Df Residuals:              132         BIC:                  -1156.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -39.2691      8.576     -4.579   0.000    -56.234   -22.304


  total_cpg_no_filter   2.036e-10   7.06e-09      0.029   0.977  -1.38e-08  1.42e-08


  bsRate_mean             39.3142      8.593      4.575   0.000     22.317    56.311


  avgReadCpG_mean          0.0118      0.005      2.176   0.031      0.001     0.022




  Omnibus:         2.812    Durbin-Watson:         1.787


  Prob(Omnibus):   0.245    Jarque-Bera (JB):      2.853


  Skew:            0.332    Prob(JB):              0.240


  Kurtosis:        2.748    Cond. No.           1.27e+10



In [61]:

    
# Regression jointplot of the pairwise PDR difference against the pair-mean
# bisulfite conversion rate (pairs4b). Print and title share one label.
heading = "PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66"
print(heading)
sns.jointplot(data=pairs4b, y="PDR_difference", x="bsRate_mean", kind="reg")
plt.title(heading)









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[61]:





<matplotlib.text.Text at 0x11be5fda0>



In [62]:

    
# Regression jointplot of the pairwise PDR difference against the pair-mean
# avgReadCpG_mean column (pairs4b).
sample = "Normal 'NormalBCD19pCD27mcell45_66"
print("PDR_difference vs. avgReadCpGs, jointplot,  " + sample)
sns.jointplot(data=pairs4b, y="PDR_difference", x="avgReadCpG_mean", kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, " + sample)









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[62]:





<matplotlib.text.Text at 0x11c0bb278>



In [63]:

    
# Regression jointplot of the pairwise PDR difference against the pair-mean
# total_cpg_no_filter column (pairs4b).
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs4b, kind="reg")
# The title names the variable actually on the x axis (total unique CpGs),
# so it agrees with the printed label.
plt.title("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[63]:





<matplotlib.text.Text at 0x11c4a6a90>



In [ ]:



In [64]:

    
# Pairwise regression for the normal sample 'NormalBCD19pCD27mcell67_88':
# regress |PDR_total(cell a) - PDR_total(cell b)| on the pair-averaged
# covariates total_cpg_no_filter, bsRate and avgReadCpgs_nofilter.
from itertools import combinations  # imported here so the cell also runs on its own

mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell67_88']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1)  # not needed below
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
# Pairs are keyed by filename, so a repeated filename would make the lookups below ambiguous.
assert mcellA.index.is_unique, "duplicate filenames in protocol 'NormalBCD19pCD27mcell67_88'"

cc = list(combinations(mcell.filename, 2))  # every unordered pair of cells

# Covariates of a pair are the mean of the two cells' values. numeric_only=True
# states explicitly that text columns are skipped; pandas >= 2.0 raises a
# TypeError instead of dropping them silently.
out = pd.DataFrame([mcellA.loc[list(c)].mean(numeric_only=True) for c in cc])
out["filename"] = cc

# Response: absolute PDR_total difference within each pair, computed per pair
# instead of building the full cell-by-cell matrix and merging it back.
pdr_total = mcellA["PDR_total"]
pairs4c = out.assign(PDR_difference=[abs(pdr_total.loc[a] - pdr_total.loc[b]) for a, b in cc])
print(pairs4c.shape)
pairs4c = pairs4c.rename(columns={'total_reads': 'total_reads_mean', "bsRate": "bsRate_mean", "avgReadCpgs_nofilter": "avgReadCpG_mean"})

y = pairs4c.PDR_difference  # dependent variable to predict

# Predictors are selected by name so the model does not depend on a drop list.
X = pairs4c[["total_cpg_no_filter", "bsRate_mean", "avgReadCpG_mean"]]

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
# NOTE(review): every cell appears in len(mcell) - 1 pairs, so the rows are not
# independent observations; read the nonrobust standard errors with caution.
est = sm.OLS(y, X).fit()
# Backslash is escaped so the literal "\delta" is printed without an invalid-escape warning.
print("Regression results for Normal 'NormalBCD19pCD27mcell67_88', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



21
(210, 14)
(210,)
(210, 3)
Regression results for Normal 'NormalBCD19pCD27mcell67_88', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[64]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.253


  Model:                    OLS          Adj. R-squared:        0.242


  Method:              Least Squares     F-statistic:           23.21


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  5.52e-13


  Time:                  22:03:05        Log-Likelihood:       589.90


  No. Observations:          210         AIC:                  -1172.


  Df Residuals:              206         BIC:                  -1158.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -346.4377     63.275     -5.475   0.000   -471.187  -221.689


  total_cpg_no_filter   1.915e-07   2.34e-08      8.166   0.000   1.45e-07  2.38e-07


  bsRate_mean            346.6396     63.391      5.468   0.000    221.661   471.618


  avgReadCpG_mean          0.1123      0.020      5.584   0.000      0.073     0.152




  Omnibus:        63.195    Durbin-Watson:         0.326


  Prob(Omnibus):   0.000    Jarque-Bera (JB):    121.871


  Skew:            1.505    Prob(JB):           3.44e-27


  Kurtosis:        5.207    Cond. No.           3.04e+10



In [65]:

    
# Regression jointplot of the pairwise PDR difference against the pair-mean
# bisulfite conversion rate (pairs4c). Print and title share one label.
heading = "PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88"
print(heading)
sns.jointplot(data=pairs4c, y="PDR_difference", x="bsRate_mean", kind="reg")
plt.title(heading)









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[65]:





<matplotlib.text.Text at 0x11c6f30b8>



In [66]:

    
# Regression jointplot of the pairwise PDR difference against the pair-mean
# avgReadCpG_mean column (pairs4c).
sample = "Normal 'NormalBCD19pCD27mcell67_88"
print("PDR_difference vs. avgReadCpGs, jointplot,  " + sample)
sns.jointplot(data=pairs4c, y="PDR_difference", x="avgReadCpG_mean", kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, " + sample)









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[66]:





<matplotlib.text.Text at 0x11ca1bb38>



In [67]:

    
# Regression jointplot of the pairwise PDR difference against the pair-mean
# total_cpg_no_filter column (pairs4c).
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs4c, kind="reg")
# The title names the variable actually on the x axis (total unique CpGs),
# so it agrees with the printed label.
plt.title("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[67]:





<matplotlib.text.Text at 0x11ccb8128>



In [ ]:



In [68]:

    
# Pairwise regression for the normal sample 'RRBS_NormalBCD19pcell1_22_':
# regress |PDR_total(cell a) - PDR_total(cell b)| on the pair-averaged
# covariates total_cpg_no_filter, bsRate and avgReadCpgs_nofilter.
from itertools import combinations  # imported here so the cell also runs on its own

CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell1_22_']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1)  # not needed below
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
# Pairs are keyed by filename, so a repeated filename would make the lookups below ambiguous.
assert CD19cellA.index.is_unique, "duplicate filenames in protocol 'RRBS_NormalBCD19pcell1_22_'"

cc = list(combinations(CD19cell.filename, 2))  # every unordered pair of cells

# Covariates of a pair are the mean of the two cells' values. numeric_only=True
# states explicitly that text columns are skipped; pandas >= 2.0 raises a
# TypeError instead of dropping them silently.
out = pd.DataFrame([CD19cellA.loc[list(c)].mean(numeric_only=True) for c in cc])
out["filename"] = cc

# Response: absolute PDR_total difference within each pair, computed per pair
# instead of building the full cell-by-cell matrix and merging it back.
pdr_total = CD19cellA["PDR_total"]
pairs5 = out.assign(PDR_difference=[abs(pdr_total.loc[a] - pdr_total.loc[b]) for a, b in cc])
print(pairs5.shape)
pairs5 = pairs5.rename(columns={'total_reads': 'total_reads_mean', "bsRate": "bsRate_mean", "avgReadCpgs_nofilter": "avgReadCpG_mean"})

y = pairs5.PDR_difference  # dependent variable to predict

# Predictors are selected by name so the model does not depend on a drop list.
X = pairs5[["total_cpg_no_filter", "bsRate_mean", "avgReadCpG_mean"]]

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
# NOTE(review): every cell appears in len(CD19cell) - 1 pairs, so the rows are not
# independent observations; read the nonrobust standard errors with caution.
est = sm.OLS(y, X).fit()
# Backslash is escaped so the literal "\delta" is printed without an invalid-escape warning.
print("Regression results for Normal 'RRBS_NormalBCD19pcell1_22_', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



21
(210, 14)
(210,)
(210, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell1_22_', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[68]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.088


  Model:                    OLS          Adj. R-squared:        0.075


  Method:              Least Squares     F-statistic:           6.645


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  0.000264


  Time:                  22:03:09        Log-Likelihood:       335.17


  No. Observations:          210         AIC:                  -662.3


  Df Residuals:              206         BIC:                  -648.9


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -288.9939     81.378     -3.551   0.000   -449.434  -128.554


  total_cpg_no_filter  -2.498e-08   2.03e-08     -1.231   0.220   -6.5e-08   1.5e-08


  bsRate_mean            290.3424     81.594      3.558   0.000    129.475   451.209


  avgReadCpG_mean         -0.0881      0.055     -1.597   0.112     -0.197     0.021




  Omnibus:        100.022    Durbin-Watson:         1.953


  Prob(Omnibus):   0.000     Jarque-Bera (JB):     15.494


  Skew:            0.291     Prob(JB):           0.000432


  Kurtosis:        1.803     Cond. No.           1.48e+10



In [69]:

    
# Regression jointplot of the pairwise PDR difference against the pair-mean
# bisulfite conversion rate (pairs5). Print and title share one label.
heading = "PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_"
print(heading)
sns.jointplot(data=pairs5, y="PDR_difference", x="bsRate_mean", kind="reg")
plt.title(heading)









    



PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[69]:





<matplotlib.text.Text at 0x11cfb1320>



In [70]:

    
# Regression jointplot of the pairwise PDR difference against the pair-mean
# avgReadCpG_mean column (pairs5).
sample = "Normal 'RRBS_NormalBCD19pcell1_22_"
print("PDR_difference vs. avgReadCpGs, jointplot,  " + sample)
sns.jointplot(data=pairs5, y="PDR_difference", x="avgReadCpG_mean", kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, " + sample)









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[70]:





<matplotlib.text.Text at 0x11d1f5828>



In [71]:

    
# Regression jointplot of the pairwise PDR difference against the pair-mean
# total_cpg_no_filter column (pairs5).
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs5, kind="reg")
# The title names the variable actually on the x axis (total unique CpGs),
# so it agrees with the printed label.
plt.title("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[71]:





<matplotlib.text.Text at 0x11d4ff630>



In [ ]:



In [72]:

    
# Pairwise regression for the normal sample 'RRBS_NormalBCD19pcell23_44':
# regress |PDR_total(cell a) - PDR_total(cell b)| on the pair-averaged
# covariates total_cpg_no_filter, bsRate and avgReadCpgs_nofilter.
from itertools import combinations  # imported here so the cell also runs on its own

CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell23_44']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1)  # not needed below
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
# Pairs are keyed by filename, so a repeated filename would make the lookups below ambiguous.
assert CD19cellA.index.is_unique, "duplicate filenames in protocol 'RRBS_NormalBCD19pcell23_44'"

cc = list(combinations(CD19cell.filename, 2))  # every unordered pair of cells

# Covariates of a pair are the mean of the two cells' values. numeric_only=True
# states explicitly that text columns are skipped; pandas >= 2.0 raises a
# TypeError instead of dropping them silently.
out = pd.DataFrame([CD19cellA.loc[list(c)].mean(numeric_only=True) for c in cc])
out["filename"] = cc

# Response: absolute PDR_total difference within each pair, computed per pair
# instead of building the full cell-by-cell matrix and merging it back.
pdr_total = CD19cellA["PDR_total"]
pairs5a = out.assign(PDR_difference=[abs(pdr_total.loc[a] - pdr_total.loc[b]) for a, b in cc])
print(pairs5a.shape)
pairs5a = pairs5a.rename(columns={'total_reads': 'total_reads_mean', "bsRate": "bsRate_mean", "avgReadCpgs_nofilter": "avgReadCpG_mean"})

y = pairs5a.PDR_difference  # dependent variable to predict

# Predictors are selected by name so the model does not depend on a drop list.
X = pairs5a[["total_cpg_no_filter", "bsRate_mean", "avgReadCpG_mean"]]

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
# NOTE(review): every cell appears in len(CD19cell) - 1 pairs, so the rows are not
# independent observations; read the nonrobust standard errors with caution.
est = sm.OLS(y, X).fit()
# Backslash is escaped so the literal "\delta" is printed without an invalid-escape warning.
print("Regression results for Normal 'RRBS_NormalBCD19pcell23_44', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell23_44', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[72]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.154


  Model:                    OLS          Adj. R-squared:        0.141


  Method:              Least Squares     F-statistic:           11.30


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  7.60e-07


  Time:                  22:03:13        Log-Likelihood:       358.88


  No. Observations:          190         AIC:                  -709.8


  Df Residuals:              186         BIC:                  -696.8


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -216.7577     41.261     -5.253   0.000   -298.157  -135.359


  total_cpg_no_filter   8.967e-09    1.2e-08      0.744   0.458  -1.48e-08  3.27e-08


  bsRate_mean            218.0187     41.427      5.263   0.000    136.291   299.746


  avgReadCpG_mean         -0.1062      0.045     -2.380   0.018     -0.194    -0.018




  Omnibus:        16.410    Durbin-Watson:         1.075


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     18.736


  Skew:            0.769    Prob(JB):           8.54e-05


  Kurtosis:        3.025    Cond. No.           1.63e+10



In [73]:

    
# Regression jointplot of the pairwise PDR difference against the pair-mean
# bisulfite conversion rate (pairs5a). Print and title share one label.
heading = "PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44"
print(heading)
sns.jointplot(data=pairs5a, y="PDR_difference", x="bsRate_mean", kind="reg")
plt.title(heading)









    



PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[73]:





<matplotlib.text.Text at 0x11d864ef0>



In [74]:

    
# Regression jointplot of the pairwise PDR difference against the pair-mean
# avgReadCpG_mean column (pairs5a).
sample = "Normal 'RRBS_NormalBCD19pcell23_44"
print("PDR_difference vs. avgReadCpGs, jointplot,  " + sample)
sns.jointplot(data=pairs5a, y="PDR_difference", x="avgReadCpG_mean", kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, " + sample)









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[74]:





<matplotlib.text.Text at 0x11dab0630>



In [75]:

    
# Regression jointplot of the pairwise PDR difference against the pair-mean
# total_cpg_no_filter column (pairs5a).
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs5a, kind="reg")
# The title names the variable actually on the x axis (total unique CpGs),
# so it agrees with the printed label.
plt.title("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[75]:





<matplotlib.text.Text at 0x11db87c88>



In [ ]:



In [76]:

    
# Pairwise regression for the normal sample 'RRBS_NormalBCD19pcell45_66':
# regress |PDR_total(cell a) - PDR_total(cell b)| on the pair-averaged
# covariates total_cpg_no_filter, bsRate and avgReadCpgs_nofilter.
from itertools import combinations  # imported here so the cell also runs on its own

CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell45_66']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1)  # not needed below
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
# Pairs are keyed by filename, so a repeated filename would make the lookups below ambiguous.
assert CD19cellA.index.is_unique, "duplicate filenames in protocol 'RRBS_NormalBCD19pcell45_66'"

cc = list(combinations(CD19cell.filename, 2))  # every unordered pair of cells

# Covariates of a pair are the mean of the two cells' values. numeric_only=True
# states explicitly that text columns are skipped; pandas >= 2.0 raises a
# TypeError instead of dropping them silently.
out = pd.DataFrame([CD19cellA.loc[list(c)].mean(numeric_only=True) for c in cc])
out["filename"] = cc

# Response: absolute PDR_total difference within each pair, computed per pair
# instead of building the full cell-by-cell matrix and merging it back.
pdr_total = CD19cellA["PDR_total"]
pairs5b = out.assign(PDR_difference=[abs(pdr_total.loc[a] - pdr_total.loc[b]) for a, b in cc])
print(pairs5b.shape)
pairs5b = pairs5b.rename(columns={'total_reads': 'total_reads_mean', "bsRate": "bsRate_mean", "avgReadCpgs_nofilter": "avgReadCpG_mean"})

y = pairs5b.PDR_difference  # dependent variable to predict

# Predictors are selected by name so the model does not depend on a drop list.
X = pairs5b[["total_cpg_no_filter", "bsRate_mean", "avgReadCpG_mean"]]

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
# NOTE(review): every cell appears in len(CD19cell) - 1 pairs, so the rows are not
# independent observations; read the nonrobust standard errors with caution.
est = sm.OLS(y, X).fit()
# Backslash is escaped so the literal "\delta" is printed without an invalid-escape warning.
print("Regression results for Normal 'RRBS_NormalBCD19pcell45_66', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



22
(231, 14)
(231,)
(231, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell45_66', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[76]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.247


  Model:                    OLS          Adj. R-squared:        0.238


  Method:              Least Squares     F-statistic:           24.89


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  5.86e-14


  Time:                  22:03:17        Log-Likelihood:       408.80


  No. Observations:          231         AIC:                  -809.6


  Df Residuals:              227         BIC:                  -795.8


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -772.3182    108.817     -7.097   0.000   -986.739  -557.898


  total_cpg_no_filter   3.354e-08   1.63e-08      2.055   0.041   1.38e-09  6.57e-08


  bsRate_mean            774.9822    109.008      7.109   0.000    560.186   989.779


  avgReadCpG_mean         -0.2205      0.062     -3.535   0.000     -0.343    -0.098




  Omnibus:        16.001    Durbin-Watson:         1.644


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     17.820


  Skew:            0.665    Prob(JB):           0.000135


  Kurtosis:        2.711    Cond. No.           2.98e+10



In [77]:

    
# Regression jointplot of the pairwise PDR difference against the pair-mean
# bisulfite conversion rate (pairs5b). Print and title share one label.
heading = "PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66"
print(heading)
sns.jointplot(data=pairs5b, y="PDR_difference", x="bsRate_mean", kind="reg")
plt.title(heading)









    



PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[77]:





<matplotlib.text.Text at 0x11dfb5a58>



In [78]:

    
# Regression jointplot of the pairwise PDR difference against the pair-mean
# avgReadCpG_mean column (pairs5b).
sample = "Normal 'RRBS_NormalBCD19pcell45_66"
print("PDR_difference vs. avgReadCpGs, jointplot,  " + sample)
sns.jointplot(data=pairs5b, y="PDR_difference", x="avgReadCpG_mean", kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, " + sample)









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[78]:





<matplotlib.text.Text at 0x11e205518>



In [79]:

    
# Regression jointplot of the pairwise PDR difference against the pair-mean
# total_cpg_no_filter column (pairs5b).
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs5b, kind="reg")
# The title names the variable actually on the x axis (total unique CpGs),
# so it agrees with the printed label.
plt.title("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[79]:





<matplotlib.text.Text at 0x11e50c4a8>



In [ ]:



In [80]:

    
# Pairwise regression for the normal sample 'RRBS_NormalBCD19pcell67_88':
# regress |PDR_total(cell a) - PDR_total(cell b)| on the pair-averaged
# covariates total_cpg_no_filter, bsRate and avgReadCpgs_nofilter.
from itertools import combinations  # imported here so the cell also runs on its own

CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell67_88']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1)  # not needed below
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
# Pairs are keyed by filename, so a repeated filename would make the lookups below ambiguous.
assert CD19cellA.index.is_unique, "duplicate filenames in protocol 'RRBS_NormalBCD19pcell67_88'"

cc = list(combinations(CD19cell.filename, 2))  # every unordered pair of cells

# Covariates of a pair are the mean of the two cells' values. numeric_only=True
# states explicitly that text columns are skipped; pandas >= 2.0 raises a
# TypeError instead of dropping them silently.
out = pd.DataFrame([CD19cellA.loc[list(c)].mean(numeric_only=True) for c in cc])
out["filename"] = cc

# Response: absolute PDR_total difference within each pair, computed per pair
# instead of building the full cell-by-cell matrix and merging it back.
pdr_total = CD19cellA["PDR_total"]
pairs5c = out.assign(PDR_difference=[abs(pdr_total.loc[a] - pdr_total.loc[b]) for a, b in cc])
print(pairs5c.shape)
pairs5c = pairs5c.rename(columns={'total_reads': 'total_reads_mean', "bsRate": "bsRate_mean", "avgReadCpgs_nofilter": "avgReadCpG_mean"})

y = pairs5c.PDR_difference  # dependent variable to predict

# Predictors are selected by name so the model does not depend on a drop list.
X = pairs5c[["total_cpg_no_filter", "bsRate_mean", "avgReadCpG_mean"]]

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
# NOTE(review): every cell appears in len(CD19cell) - 1 pairs, so the rows are not
# independent observations; read the nonrobust standard errors with caution.
est = sm.OLS(y, X).fit()
# Backslash is escaped so the literal "\delta" is printed without an invalid-escape warning.
print("Regression results for Normal 'RRBS_NormalBCD19pcell67_88', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell67_88', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[80]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.057


  Model:                    OLS          Adj. R-squared:        0.042


  Method:              Least Squares     F-statistic:           3.741


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):   0.0121 


  Time:                  22:03:21        Log-Likelihood:       324.90


  No. Observations:          190         AIC:                  -641.8


  Df Residuals:              186         BIC:                  -628.8


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -128.2866     89.855     -1.428   0.155   -305.553    48.980


  total_cpg_no_filter  -2.106e-08   1.72e-08     -1.221   0.223  -5.51e-08   1.3e-08


  bsRate_mean            129.1661     90.075      1.434   0.153    -48.534   306.866


  avgReadCpG_mean         -0.1056      0.044     -2.384   0.018     -0.193    -0.018




  Omnibus:        11.622    Durbin-Watson:         2.479


  Prob(Omnibus):   0.003    Jarque-Bera (JB):      7.442


  Skew:            0.338    Prob(JB):             0.0242


  Kurtosis:        2.306    Cond. No.           1.62e+10



In [81]:

    
# Regression jointplot of the pairwise PDR difference against the pair-mean
# bisulfite conversion rate (pairs5c, built from 'RRBS_NormalBCD19pcell67_88').
print("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs5c, kind="reg")
# The title names the sample that pairs5c was built from, matching the printed label.
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88")









    



PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[81]:





<matplotlib.text.Text at 0x11e85a748>



In [82]:

    
# Regression jointplot of the pairwise PDR difference against the pair-mean
# avgReadCpG_mean column (pairs5c, built from 'RRBS_NormalBCD19pcell67_88').
# Both labels name the sample that pairs5c was built from.
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs5c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell67_88")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[82]:





<matplotlib.text.Text at 0x11eac4828>



In [83]:

    
# Regression jointplot of the pairwise PDR difference against the pair-mean
# total_cpg_no_filter column (pairs5c, built from 'RRBS_NormalBCD19pcell67_88').
# Both labels name the sample that pairs5c was built from, and the title names
# the variable actually on the x axis (total unique CpGs).
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs5c, kind="reg")
plt.title("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[83]:





<matplotlib.text.Text at 0x11edd8e48>



In [ ]:



In [84]:

    
# Pairwise regression for the normal sample 'normal_B_cell_A1_24':
# regress |PDR_total(cell a) - PDR_total(cell b)| on the pair-averaged
# covariates total_cpg_no_filter, bsRate and avgReadCpgs_nofilter.
from itertools import combinations  # imported here so the cell also runs on its own

normb = merged[merged["protocol"] == 'normal_B_cell_A1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1)  # not needed below
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
# Pairs are keyed by filename, so a repeated filename would make the lookups below ambiguous.
assert normbA.index.is_unique, "duplicate filenames in protocol 'normal_B_cell_A1_24'"

cc = list(combinations(normb.filename, 2))  # every unordered pair of cells

# Covariates of a pair are the mean of the two cells' values. numeric_only=True
# states explicitly that text columns are skipped; pandas >= 2.0 raises a
# TypeError instead of dropping them silently.
out = pd.DataFrame([normbA.loc[list(c)].mean(numeric_only=True) for c in cc])
out["filename"] = cc

# Response: absolute PDR_total difference within each pair, computed per pair
# instead of building the full cell-by-cell matrix and merging it back.
pdr_total = normbA["PDR_total"]
pairs6 = out.assign(PDR_difference=[abs(pdr_total.loc[a] - pdr_total.loc[b]) for a, b in cc])
print(pairs6.shape)
pairs6 = pairs6.rename(columns={'total_reads': 'total_reads_mean', "bsRate": "bsRate_mean", "avgReadCpgs_nofilter": "avgReadCpG_mean"})

y = pairs6.PDR_difference  # dependent variable to predict

# Predictors are selected by name so the model does not depend on a drop list.
X = pairs6[["total_cpg_no_filter", "bsRate_mean", "avgReadCpG_mean"]]

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
# NOTE(review): every cell appears in len(normb) - 1 pairs, so the rows are not
# independent observations; read the nonrobust standard errors with caution.
est = sm.OLS(y, X).fit()
# Backslash is escaped so the literal "\delta" is printed without an invalid-escape warning.
print("Regression results for Normal 'normal_B_cell_A1_24', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_A1_24', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[84]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.066


  Model:                    OLS          Adj. R-squared:        0.049


  Method:              Least Squares     F-statistic:           3.934


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):   0.00958


  Time:                  22:03:25        Log-Likelihood:       246.73


  No. Observations:          171         AIC:                  -485.5


  Df Residuals:              167         BIC:                  -472.9


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -11.1577     15.893     -0.702   0.484    -42.535    20.219


  total_cpg_no_filter   6.745e-08    1.7e-07      0.396   0.692  -2.69e-07  4.03e-07


  bsRate_mean             13.6349     16.580      0.822   0.412    -19.098    46.368


  avgReadCpG_mean         -0.3516      0.105     -3.343   0.001     -0.559    -0.144




  Omnibus:        16.967    Durbin-Watson:         1.918


  Prob(Omnibus):   0.000    Jarque-Bera (JB):      5.746


  Skew:            0.074    Prob(JB):             0.0565


  Kurtosis:        2.114    Cond. No.           9.64e+08



In [85]:

    
# Joint regression plot: PDR_difference against bsRate_mean for the rows of pairs6.
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24")
joint_grid = sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6, kind="reg")
# plt.title() would label whichever sub-axes of the grid happens to be current;
# a figure-level suptitle (nudged above the top marginal) titles the whole plot.
joint_grid.fig.suptitle("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24", y=1.02)









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[85]:





<matplotlib.text.Text at 0x11f056ba8>



In [86]:

    
# Joint regression plot: PDR_difference against avgReadCpG_mean for the rows of pairs6.
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_A1_24")
joint_grid = sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6, kind="reg")
# plt.title() would label whichever sub-axes of the grid happens to be current;
# a figure-level suptitle (nudged above the top marginal) titles the whole plot.
joint_grid.fig.suptitle("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_A1_24", y=1.02)









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_A1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[86]:





<matplotlib.text.Text at 0x11f29b630>



In [87]:

    
# Joint regression plot: PDR_difference against total_cpg_no_filter for the rows of pairs6.
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_A1_24")
joint_grid = sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6, kind="reg")
# The title now names the variable actually plotted on x (it previously said
# avgReadCpGs_mean). It is set as a figure-level suptitle because plt.title()
# would label whichever sub-axes of the grid happens to be current.
joint_grid.fig.suptitle("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_A1_24", y=1.02)









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_A1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[87]:





<matplotlib.text.Text at 0x11f4c67b8>



In [ ]:



In [88]:

    
# Pairwise OLS for protocol 'normal_B_cell_B1_24': regress the absolute PDR_total
# difference between two cells on the pair's mean covariates.
from itertools import combinations

normb = merged[merged["protocol"] == 'normal_B_cell_B1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")

# Every unordered pair of cells, identified by a (filename_i, filename_j) tuple.
cc = list(combinations(normb.filename,2))
# Per-pair mean of each numeric column. list(c) makes .loc select the two rows by
# label (a bare tuple key is ambiguous), and numeric_only=True skips any
# non-numeric columns explicitly instead of relying on pandas dropping them.
out = pd.DataFrame([normbA.loc[list(c),:].mean(numeric_only=True) for c in cc], index=cc)

# Full |PDR_i - PDR_j| matrix, flattened to one row per ordered pair.
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_total, normb.PDR_total)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
# The inner join on the pair tuple keeps only the orderings present in `out`,
# discarding the mirrored and self pairs of the full matrix.
pairs6a = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs6a.shape)
pairs6a = pairs6a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6a.PDR_difference # dependent variable to predict

# Whatever columns survive this drop are the regressors.
X = pairs6a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# "\\delta" prints a literal backslash-delta; the unescaped "\d" is an invalid escape sequence.
print("Regression results for Normal 'normal_B_cell_B1_24', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'normal_B_cell_B1_24', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[88]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.006


  Model:                    OLS          Adj. R-squared:       -0.010


  Method:              Least Squares     F-statistic:          0.3682


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):    0.776 


  Time:                  22:03:29        Log-Likelihood:       251.53


  No. Observations:          190         AIC:                  -495.1


  Df Residuals:              186         BIC:                  -482.1


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -26.2541     41.573     -0.632   0.528   -108.270    55.761


  total_cpg_no_filter     1.9e-08   5.39e-08      0.353   0.725  -8.73e-08  1.25e-07


  bsRate_mean             27.7647     43.574      0.637   0.525    -58.198   113.727


  avgReadCpG_mean         -0.0651      0.089     -0.729   0.467     -0.241     0.111




  Omnibus:        381.466    Durbin-Watson:         1.986


  Prob(Omnibus):   0.000     Jarque-Bera (JB):     15.970


  Skew:            0.127     Prob(JB):           0.000341


  Kurtosis:        1.603     Cond. No.           4.74e+09



In [89]:

    
# Joint regression plot: PDR_difference against bsRate_mean for the rows of pairs6a.
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24")
joint_grid = sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6a, kind="reg")
# plt.title() would label whichever sub-axes of the grid happens to be current;
# a figure-level suptitle (nudged above the top marginal) titles the whole plot.
joint_grid.fig.suptitle("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24", y=1.02)









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[89]:





<matplotlib.text.Text at 0x11e82fc50>



In [90]:

    
# Joint regression plot: PDR_difference against avgReadCpG_mean for the rows of pairs6a.
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_B1_24")
joint_grid = sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6a, kind="reg")
# plt.title() would label whichever sub-axes of the grid happens to be current;
# a figure-level suptitle (nudged above the top marginal) titles the whole plot.
joint_grid.fig.suptitle("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_B1_24", y=1.02)









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_B1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[90]:





<matplotlib.text.Text at 0x11f9aa908>



In [91]:

    
# Joint regression plot: PDR_difference against total_cpg_no_filter for the rows of pairs6a.
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_B1_24")
joint_grid = sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6a, kind="reg")
# The title now names the variable actually plotted on x (it previously said
# avgReadCpGs_mean). It is set as a figure-level suptitle because plt.title()
# would label whichever sub-axes of the grid happens to be current.
joint_grid.fig.suptitle("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_B1_24", y=1.02)









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_B1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[91]:





<matplotlib.text.Text at 0x11fcad4e0>



In [ ]:



In [92]:

    
# Pairwise OLS for protocol 'normal_B_cell_C1_24': regress the absolute PDR_total
# difference between two cells on the pair's mean covariates.
from itertools import combinations

normb = merged[merged["protocol"] == 'normal_B_cell_C1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")

# Every unordered pair of cells, identified by a (filename_i, filename_j) tuple.
cc = list(combinations(normb.filename,2))
# Per-pair mean of each numeric column. list(c) makes .loc select the two rows by
# label (a bare tuple key is ambiguous), and numeric_only=True skips any
# non-numeric columns explicitly instead of relying on pandas dropping them.
out = pd.DataFrame([normbA.loc[list(c),:].mean(numeric_only=True) for c in cc], index=cc)

# Full |PDR_i - PDR_j| matrix, flattened to one row per ordered pair.
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_total, normb.PDR_total)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
# The inner join on the pair tuple keeps only the orderings present in `out`,
# discarding the mirrored and self pairs of the full matrix.
pairs6b = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs6b.shape)
pairs6b = pairs6b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6b.PDR_difference # dependent variable to predict

# Whatever columns survive this drop are the regressors.
X = pairs6b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# "\\delta" prints a literal backslash-delta; the unescaped "\d" is an invalid escape sequence.
print("Regression results for Normal 'normal_B_cell_C1_24', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_C1_24', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[92]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.026


  Model:                    OLS          Adj. R-squared:        0.009


  Method:              Least Squares     F-statistic:           1.493


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):    0.218 


  Time:                  22:03:33        Log-Likelihood:       238.33


  No. Observations:          171         AIC:                  -468.7


  Df Residuals:              167         BIC:                  -456.1


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   77.9696     38.634      2.018   0.045      1.696   154.243


  total_cpg_no_filter   2.131e-08   4.12e-08      0.517   0.606  -6.01e-08  1.03e-07


  bsRate_mean            -81.8433     40.423     -2.025   0.044   -161.649    -2.037


  avgReadCpG_mean          0.1606      0.119      1.350   0.179     -0.074     0.396




  Omnibus:        65.201    Durbin-Watson:         1.106


  Prob(Omnibus):   0.000    Jarque-Bera (JB):      9.989


  Skew:            0.068    Prob(JB):            0.00677


  Kurtosis:        1.824    Cond. No.           7.61e+09



In [93]:

    
# Joint regression plot: PDR_difference against bsRate_mean for the rows of pairs6b.
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24")
joint_grid = sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6b, kind="reg")
# plt.title() would label whichever sub-axes of the grid happens to be current;
# a figure-level suptitle (nudged above the top marginal) titles the whole plot.
joint_grid.fig.suptitle("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24", y=1.02)









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[93]:





<matplotlib.text.Text at 0x11ff19e10>



In [94]:

    
# Joint regression plot: PDR_difference against avgReadCpG_mean for the rows of pairs6b.
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_C1_24")
joint_grid = sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6b, kind="reg")
# plt.title() would label whichever sub-axes of the grid happens to be current;
# a figure-level suptitle (nudged above the top marginal) titles the whole plot.
joint_grid.fig.suptitle("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_C1_24", y=1.02)









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_C1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[94]:





<matplotlib.text.Text at 0x12012ff60>



In [95]:

    
# Joint regression plot: PDR_difference against total_cpg_no_filter for the rows of pairs6b.
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_C1_24")
joint_grid = sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6b, kind="reg")
# The title now names the variable actually plotted on x (it previously said
# avgReadCpGs_mean). It is set as a figure-level suptitle because plt.title()
# would label whichever sub-axes of the grid happens to be current.
joint_grid.fig.suptitle("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_C1_24", y=1.02)









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_C1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[95]:





<matplotlib.text.Text at 0x120434c50>



In [ ]:



In [96]:

    
# Pairwise OLS for protocol 'normal_B_cell_D1_24': regress the absolute PDR_total
# difference between two cells on the pair's mean covariates.
from itertools import combinations

normb = merged[merged["protocol"] == 'normal_B_cell_D1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")

# Every unordered pair of cells, identified by a (filename_i, filename_j) tuple.
cc = list(combinations(normb.filename,2))
# Per-pair mean of each numeric column. list(c) makes .loc select the two rows by
# label (a bare tuple key is ambiguous), and numeric_only=True skips any
# non-numeric columns explicitly instead of relying on pandas dropping them.
out = pd.DataFrame([normbA.loc[list(c),:].mean(numeric_only=True) for c in cc], index=cc)

# Full |PDR_i - PDR_j| matrix, flattened to one row per ordered pair.
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_total, normb.PDR_total)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
# The inner join on the pair tuple keeps only the orderings present in `out`,
# discarding the mirrored and self pairs of the full matrix.
pairs6c = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs6c.shape)
pairs6c = pairs6c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6c.PDR_difference # dependent variable to predict

# Whatever columns survive this drop are the regressors.
X = pairs6c.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# "\\delta" prints a literal backslash-delta; the unescaped "\d" is an invalid escape sequence.
print("Regression results for Normal 'normal_B_cell_D1_24', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_D1_24', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[96]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.017


  Model:                    OLS          Adj. R-squared:       -0.001


  Method:              Least Squares     F-statistic:          0.9516


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):    0.417 


  Time:                  22:03:36        Log-Likelihood:       245.03


  No. Observations:          171         AIC:                  -482.1


  Df Residuals:              167         BIC:                  -469.5


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   42.9510     25.806      1.664   0.098     -7.997    93.899


  total_cpg_no_filter   4.945e-09   3.55e-08      0.139   0.889  -6.51e-08   7.5e-08


  bsRate_mean            -44.7683     26.951     -1.661   0.099    -97.977     8.440


  avgReadCpG_mean          0.0318      0.085      0.373   0.710     -0.137     0.200




  Omnibus:        371.562    Durbin-Watson:         2.061


  Prob(Omnibus):   0.000     Jarque-Bera (JB):     15.713


  Skew:            0.223     Prob(JB):           0.000387


  Kurtosis:        1.584     Cond. No.           5.21e+09



In [97]:

    
# Joint regression plot: PDR_difference against bsRate_mean for the rows of pairs6c.
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24")
joint_grid = sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6c, kind="reg")
# plt.title() would label whichever sub-axes of the grid happens to be current;
# a figure-level suptitle (nudged above the top marginal) titles the whole plot.
joint_grid.fig.suptitle("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24", y=1.02)









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[97]:





<matplotlib.text.Text at 0x120771f28>



In [98]:

    
# Joint regression plot: PDR_difference against avgReadCpG_mean for the rows of pairs6c.
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_D1_24")
joint_grid = sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6c, kind="reg")
# plt.title() would label whichever sub-axes of the grid happens to be current;
# a figure-level suptitle (nudged above the top marginal) titles the whole plot.
joint_grid.fig.suptitle("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_D1_24", y=1.02)









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_D1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[98]:





<matplotlib.text.Text at 0x120981ef0>



In [99]:

    
# Joint regression plot: PDR_difference against total_cpg_no_filter for the rows of pairs6c.
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_D1_24")
joint_grid = sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6c, kind="reg")
# The title now names the variable actually plotted on x (it previously said
# avgReadCpGs_mean). It is set as a figure-level suptitle because plt.title()
# would label whichever sub-axes of the grid happens to be current.
joint_grid.fig.suptitle("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_D1_24", y=1.02)









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_D1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[99]:





<matplotlib.text.Text at 0x120c89eb8>



In [ ]:



In [100]:

    
# Pairwise OLS for protocol 'normal_B_cell_G1_22': regress the absolute PDR_total
# difference between two cells on the pair's mean covariates.
from itertools import combinations

normb = merged[merged["protocol"] == 'normal_B_cell_G1_22']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")

# Every unordered pair of cells, identified by a (filename_i, filename_j) tuple.
cc = list(combinations(normb.filename,2))
# Per-pair mean of each numeric column. list(c) makes .loc select the two rows by
# label (a bare tuple key is ambiguous), and numeric_only=True skips any
# non-numeric columns explicitly instead of relying on pandas dropping them.
out = pd.DataFrame([normbA.loc[list(c),:].mean(numeric_only=True) for c in cc], index=cc)

# Full |PDR_i - PDR_j| matrix, flattened to one row per ordered pair.
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_total, normb.PDR_total)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
# The inner join on the pair tuple keeps only the orderings present in `out`,
# discarding the mirrored and self pairs of the full matrix.
pairs6d = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs6d.shape)
pairs6d = pairs6d.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6d.PDR_difference # dependent variable to predict

# Whatever columns survive this drop are the regressors.
X = pairs6d.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# "\\delta" prints a literal backslash-delta; the unescaped "\d" is an invalid escape sequence.
print("Regression results for Normal 'normal_B_cell_G1_22', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_G1_22', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[100]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.077


  Model:                    OLS          Adj. R-squared:        0.061


  Method:              Least Squares     F-statistic:           4.672


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):   0.00367


  Time:                  22:03:39        Log-Likelihood:       258.29


  No. Observations:          171         AIC:                  -508.6


  Df Residuals:              167         BIC:                  -496.0


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -22.1956     26.747     -0.830   0.408    -75.002    30.611


  total_cpg_no_filter   2.302e-08    4.9e-08      0.470   0.639  -7.37e-08   1.2e-07


  bsRate_mean             21.7106     28.252      0.768   0.443    -34.067    77.488


  avgReadCpG_mean          0.2628      0.121      2.177   0.031      0.024     0.501




  Omnibus:        35.431    Durbin-Watson:         1.950


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     10.460


  Skew:            0.317    Prob(JB):            0.00535


  Kurtosis:        1.967    Cond. No.           4.87e+09



In [101]:

    
# Joint regression plot: PDR_difference against bsRate_mean for the rows of pairs6d.
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22")
joint_grid = sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6d, kind="reg")
# plt.title() would label whichever sub-axes of the grid happens to be current;
# a figure-level suptitle (nudged above the top marginal) titles the whole plot.
joint_grid.fig.suptitle("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22", y=1.02)









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[101]:





<matplotlib.text.Text at 0x120fb2518>



In [102]:

    
# Joint regression plot: PDR_difference against avgReadCpG_mean for the rows of pairs6d.
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_G1_22")
joint_grid = sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6d, kind="reg")
# plt.title() would label whichever sub-axes of the grid happens to be current;
# a figure-level suptitle (nudged above the top marginal) titles the whole plot.
joint_grid.fig.suptitle("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_G1_22", y=1.02)









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_G1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[102]:





<matplotlib.text.Text at 0x1211df898>



In [103]:

    
# Joint regression plot: PDR_difference against total_cpg_no_filter for the rows of pairs6d.
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_G1_22")
joint_grid = sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6d, kind="reg")
# The title now names the variable actually plotted on x (it previously said
# avgReadCpGs_mean). It is set as a figure-level suptitle because plt.title()
# would label whichever sub-axes of the grid happens to be current.
joint_grid.fig.suptitle("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_G1_22", y=1.02)









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_G1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[103]:





<matplotlib.text.Text at 0x1214f7080>



In [ ]:



In [104]:

    
# Pairwise OLS for protocol 'normal_B_cell_H1_22': regress the absolute PDR_total
# difference between two cells on the pair's mean covariates.
from itertools import combinations

normb = merged[merged["protocol"] == 'normal_B_cell_H1_22']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")

# Every unordered pair of cells, identified by a (filename_i, filename_j) tuple.
cc = list(combinations(normb.filename,2))
# Per-pair mean of each numeric column. list(c) makes .loc select the two rows by
# label (a bare tuple key is ambiguous), and numeric_only=True skips any
# non-numeric columns explicitly instead of relying on pandas dropping them.
out = pd.DataFrame([normbA.loc[list(c),:].mean(numeric_only=True) for c in cc], index=cc)

# Full |PDR_i - PDR_j| matrix, flattened to one row per ordered pair.
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_total, normb.PDR_total)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
# The inner join on the pair tuple keeps only the orderings present in `out`,
# discarding the mirrored and self pairs of the full matrix.
pairs6e = pd.merge(out, PDR_differences, how='inner', on='filename')
print(pairs6e.shape)
pairs6e = pairs6e.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6e.PDR_difference # dependent variable to predict

# Whatever columns survive this drop are the regressors.
X = pairs6e.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# "\\delta" prints a literal backslash-delta; the unescaped "\d" is an invalid escape sequence.
print("Regression results for Normal 'normal_B_cell_H1_22', predict \\delta PDR_total")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



15
(105, 14)
(105,)
(105, 3)
Regression results for Normal 'normal_B_cell_H1_22', predict \delta PDR_total

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[104]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.124


  Model:                    OLS          Adj. R-squared:        0.098


  Method:              Least Squares     F-statistic:           4.765


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):   0.00378


  Time:                  22:03:42        Log-Likelihood:       156.42


  No. Observations:          105         AIC:                  -304.8


  Df Residuals:              101         BIC:                  -294.2


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   81.3854     47.701      1.706   0.091    -13.240   176.011


  total_cpg_no_filter   2.279e-07   7.13e-08      3.198   0.002   8.65e-08  3.69e-07


  bsRate_mean            -81.9604     49.381     -1.660   0.100   -179.919    15.999


  avgReadCpG_mean         -0.4986      0.216     -2.303   0.023     -0.928    -0.069




  Omnibus:        13.148    Durbin-Watson:         1.372


  Prob(Omnibus):   0.001    Jarque-Bera (JB):      9.844


  Skew:            0.634    Prob(JB):            0.00728


  Kurtosis:        2.198    Cond. No.           5.84e+09



In [105]:

    
# Pairwise PDR difference against the pair-averaged bsRate covariate for the
# frame `pairs6e`, drawn as a seaborn joint regression plot.
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(
    data=pairs6e,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
# Kept as the final expression so the cell still displays the Text object.
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22")









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[105]:





<matplotlib.text.Text at 0x1218159e8>



In [106]:

    
# Pairwise PDR difference against the pair-averaged avgReadCpG covariate for
# the frame `pairs6e`, drawn as a seaborn joint regression plot.
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(
    data=pairs6e,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
# Kept as the final expression so the cell still displays the Text object.
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_H1_22")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_H1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[106]:





<matplotlib.text.Text at 0x121a40278>



In [107]:

    
# Pairwise PDR difference against the total_cpg_no_filter covariate for the
# frame `pairs6e`, drawn as a seaborn joint regression plot.
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6e, kind="reg")
# The title previously named avgReadCpGs_mean (copied from the preceding cell)
# although the x-axis is total_cpg_no_filter; it now matches the plotted data.
plt.title("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_H1_22")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_H1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[107]:





<matplotlib.text.Text at 0x121be1da0>



In [108]:

    
# Print the (rows, columns) shape of every pairwise frame, one per line,
# in the same order the frames are later concatenated.
for _pair_frame in (
    pairs1, pairs1a,
    pairs2, pairs2a, pairs2b,
    pairs3, pairs3a, pairs3b, pairs3c,
    pairs4, pairs4a, pairs4b, pairs4c,
    pairs5, pairs5a, pairs5b, pairs5c,
    pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e,
):
    print(_pair_frame.shape)









    



(210, 14)
(210, 14)
(190, 14)
(190, 14)
(190, 14)
(136, 14)
(231, 14)
(36, 14)
(190, 14)
(153, 14)
(153, 14)
(136, 14)
(210, 14)
(210, 14)
(190, 14)
(231, 14)
(190, 14)
(171, 14)
(190, 14)
(171, 14)
(171, 14)
(171, 14)
(105, 14)



In [109]:

    
# Tag each pairwise frame (in place) with the cell type of its batch:
# the pairs1*/pairs2* frames are CLL, everything from pairs3 onwards is normal.
for _cll_frame in (pairs1, pairs1a, pairs2, pairs2a, pairs2b):
    _cll_frame['type'] = 'CLL'

for _normal_frame in (
    pairs3, pairs3a, pairs3b, pairs3c,
    pairs4, pairs4a, pairs4b, pairs4c,
    pairs5, pairs5a, pairs5b, pairs5c,
    pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e,
):
    _normal_frame['type'] = 'normal'

# All frames, CLL first, in the order used for the combined analysis.
frames = [
    pairs1, pairs1a, pairs2, pairs2a, pairs2b,
    pairs3, pairs3a, pairs3b, pairs3c,
    pairs4, pairs4a, pairs4b, pairs4c,
    pairs5, pairs5a, pairs5b, pairs5c,
    pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e,
]



In [110]:

    
# Stack all per-batch pairwise frames row-wise into one table.
# NOTE(review): each frame's own row labels are kept (no ignore_index), so
# labels presumably repeat across batches — confirm before label-based lookups.
total_pairs = pd.concat(frames, axis=0)



In [111]:

    
# Display (rows, columns) of the combined pairwise table.
total_pairs.shape









    Out[111]:





(4035, 15)



In [112]:

    
# Response: pairwise PDR difference over all batches.
y = total_pairs["PDR_difference"]

# Design matrix: drop the response, the pair identifier and every column that
# is not used as a covariate; the remaining columns (including 'type') stay.
X = total_pairs.drop(
    [
        "PDR_difference",
        "filename",
        "methylation",
        "PDR_total",
        "methylation_unweighted",
        "PDR_unweighted",
        "total_reads_mean",
        "total_cpg_gtrthan1",
        "total_cpg_gtrthan38",
        "avgReadCpgs_lessthan1CpG",
        "avgReadCpgs_gtreql3.8CpG",
    ],
    axis=1,
)



In [113]:

    
# Encode the categorical covariate(s) as indicator columns, drop the 'normal'
# indicator so it acts as the reference level, and fit OLS on all pairs.
categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing". Assign the result back rather
    # than calling fillna(inplace=True) on a column selection, which can act on
    # a temporary copy and leave X unchanged.
    X[variable] = X[variable].fillna("Missing")
    # Create the indicator columns. Cast to float so the design matrix stays
    # purely numeric regardless of the dtype get_dummies returns.
    dummies = pd.get_dummies(X[variable], prefix=variable).astype(float)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X = X.drop([variable], axis=1)

X = X.drop(['type_normal'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
# "\\delta" prints the same text as before; the bare "\d" was an invalid
# escape sequence that newer Python versions warn about.
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \\delta PDR_total")
est.summary()









    



Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR_total






    Out[113]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:              0.292 


  Model:                    OLS          Adj. R-squared:         0.292 


  Method:              Least Squares     F-statistic:            416.1 


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):   1.70e-300


  Time:                  22:03:46        Log-Likelihood:        7168.7 


  No. Observations:         4035         AIC:                -1.433e+04


  Df Residuals:             4030         BIC:                -1.430e+04


  Df Model:                    4                                       


  Covariance Type:       nonrobust                                     




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    1.5166      0.056     26.926   0.000      1.406     1.627


  total_cpg_no_filter   2.212e-08   3.55e-09      6.226   0.000   1.52e-08  2.91e-08


  bsRate_mean             -1.2101      0.042    -28.825   0.000     -1.292    -1.128


  avgReadCpG_mean         -0.0528      0.007     -7.241   0.000     -0.067    -0.038


  type_CLL                -0.0632      0.002    -36.274   0.000     -0.067    -0.060




  Omnibus:        441.585    Durbin-Watson:         1.572 


  Prob(Omnibus):   0.000     Jarque-Bera (JB):    605.853 


  Skew:            0.874     Prob(JB):           2.76e-132


  Kurtosis:        3.740     Cond. No.           4.97e+07



In [114]:

    
# Display the first rows of the design matrix used for the combined OLS fit.
X.head()









    Out[114]:






  
    
      
      const
      total_cpg_no_filter
      bsRate_mean
      avgReadCpG_mean
      type_CLL
    
  
  
    
      0
      1
      709541.5
      0.980181
      5.354731
      1.0
    
    
      1
      1
      786284.0
      0.980293
      5.367319
      1.0
    
    
      2
      1
      683623.5
      0.980268
      5.369633
      1.0
    
    
      3
      1
      853200.5
      0.980262
      5.367604
      1.0
    
    
      4
      1
      842686.5
      0.980228
      5.374553
      1.0



In [115]:

    
# Pairwise PDR difference against bsRate_mean over the combined table
# `total_pairs`, drawn as a seaborn joint regression plot.
print("PDR_difference vs. bsRate, jointplot, both CLL and normal ")
sns.jointplot(
    data=total_pairs,
    x="bsRate_mean",
    y="PDR_difference",
    kind="reg",
)
# Kept as the final expression so the cell still displays the Text object.
plt.title("PDR_difference vs. bsRate, jointplot, both CLL and normal ")









    



PDR_difference vs. bsRate, jointplot, both CLL and normal 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[115]:





<matplotlib.text.Text at 0x121f36fd0>



In [116]:

    
# Pairwise PDR difference against avgReadCpG_mean over the combined table
# `total_pairs`, drawn as a seaborn joint regression plot.
print("PDR_difference vs. mean avgReadCpG per cell, jointplot, both CLL and normal ")
sns.jointplot(
    data=total_pairs,
    x="avgReadCpG_mean",
    y="PDR_difference",
    kind="reg",
)
# Kept as the final expression so the cell still displays the Text object.
plt.title("PDR_difference vs. avgReadCpG_mean, jointplot, both CLL and normal ")









    



PDR_difference vs. mean avgReadCpG per cell, jointplot, both CLL and normal 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[116]:





<matplotlib.text.Text at 0x122224d30>



In [117]:

    
# Pairwise PDR difference against total_cpg_no_filter over the combined table
# `total_pairs`, drawn as a seaborn joint regression plot.
sns.jointplot(
    data=total_pairs,
    x="total_cpg_no_filter",
    y="PDR_difference",
    kind="reg",
)
# Kept as the final expression so the cell still displays the Text object.
plt.title("PDR_difference vs. total # Unique_CpGs per cell, jointplot, both CLL and normal ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[117]:





<matplotlib.text.Text at 0x122515748>



In [118]:

    
# Regression of PDR difference on bsRate_mean over `total_pairs`, with one
# fitted line per value of the 'type' column.
print("PDR_difference vs. bsRate, by type! ")
sns.lmplot(
    data=total_pairs,
    x="bsRate_mean",
    y="PDR_difference",
    hue='type',
)
# Kept as the final expression so the cell still displays the Text object.
plt.title("PDR_difference vs. bsRate, by type, CLL vs normal")









    



PDR_difference vs. bsRate, by type! 






    Out[118]:





<matplotlib.text.Text at 0x1228fa358>



In [119]:

    
# Regression of PDR difference on avgReadCpG_mean over `total_pairs`, with one
# fitted line per value of the 'type' column.
print("PDR_difference vs. avgReadCpG per cell, by type! ")
sns.lmplot(
    data=total_pairs,
    x="avgReadCpG_mean",
    y="PDR_difference",
    hue='type',
)
# Kept as the final expression so the cell still displays the Text object.
plt.title("PDR_difference vs. avgReadCpG per cell, by type, CLL vs Normal ")









    



PDR_difference vs. avgReadCpG per cell, by type! 






    Out[119]:





<matplotlib.text.Text at 0x122969f60>



In [120]:

    
# Regression of PDR difference on total_cpg_no_filter over `total_pairs`, with
# one fitted line per value of the 'type' column.
print("PDR_difference vs. total unique CpG per cel, by type!")
sns.lmplot(
    data=total_pairs,
    x="total_cpg_no_filter",
    y="PDR_difference",
    hue='type',
)
# Kept as the final expression so the cell still displays the Text object.
plt.title("PDR_difference vs. total unique CpG per cell, by type, CLL vs Normal ")









    



PDR_difference vs. total unique CpG per cel, by type!






    Out[120]:





<matplotlib.text.Text at 0x122b316a0>



In [121]:

    
#
# Let's see feature ranking
#
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

# Fit a random forest on the current design matrix X and response y, then plot
# the impurity-based feature importances in ascending order.
model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
# Series.sort() is deprecated (see the FutureWarning this cell used to emit);
# sort_values() gives the same ascending order.
feature_importances = feature_importances.sort_values()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: cell type, avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
# NOTE: this score is computed on the same X, y the model was fitted on.
print(str("Random Forest model score is ") + str(model.score(X,y)))









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:11: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting






    



Random Forest model score is 0.898261797205



In [122]:

    
#
# Now do analysis by cell type, CLL vs normal, entire
#
cll_frames = [pairs1, pairs1a, pairs2, pairs2a, pairs2b]

normal_frames = [pairs3, pairs3a, pairs3b, pairs3c, pairs4, pairs4a, pairs4b, pairs4c, 
                 pairs5, pairs5a, pairs5b, pairs5c, pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e]

cll_pairs = pd.concat(cll_frames)
print(cll_pairs.shape)

normal_pairs = pd.concat(normal_frames)
print(normal_pairs.shape)









    



(990, 15)
(3045, 15)



In [123]:

    
#
# CLL first
#
y = cll_pairs.PDR_difference # dependent variable
X = cll_pairs.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG", "type"], axis=1)

X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR")
print(X.shape)
est.summary()









    



Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR
(990, 4)






    Out[123]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.030


  Model:                    OLS          Adj. R-squared:        0.027


  Method:              Least Squares     F-statistic:           10.20


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  1.28e-06


  Time:                  22:06:21        Log-Likelihood:       3620.6


  No. Observations:          990         AIC:                  -7233.


  Df Residuals:              986         BIC:                  -7214.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    0.1639      0.040      4.098   0.000      0.085     0.242


  total_cpg_no_filter   7.963e-10    1.7e-09      0.468   0.640  -2.54e-09  4.14e-09


  bsRate_mean             -0.1458      0.038     -3.819   0.000     -0.221    -0.071


  avgReadCpG_mean         -0.0026      0.003     -0.954   0.340     -0.008     0.003




  Omnibus:        90.903    Durbin-Watson:         1.967


  Prob(Omnibus):   0.000    Jarque-Bera (JB):    114.285


  Skew:            0.807    Prob(JB):           1.52e-25


  Kurtosis:        3.404    Cond. No.           1.63e+08



In [124]:

    
# Pairwise PDR difference against bsRate_mean for the CLL pairs.
# The plot previously used `total_pairs` (CLL and normal together) although
# it is labelled "CLL pairs"; it now plots `cll_pairs`.
print("PDR_difference vs. bsRate, jointplot, CLL pairs ")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=cll_pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  CLL pairs ")









    



PDR_difference vs. bsRate, jointplot, CLL pairs 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[124]:





<matplotlib.text.Text at 0x1a96b4898>



In [125]:

    
# Pairwise PDR difference against avgReadCpG_mean for the CLL pairs.
# The plot previously used `total_pairs` (CLL and normal together) although
# it is labelled "CLL pairs"; it now plots `cll_pairs`.
print("PDR_difference vs. mean avgReadCpG per cell, jointplot,  CLL pairs ")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=cll_pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpG_mean, jointplot, CLL pairs ")









    



PDR_difference vs. mean avgReadCpG per cell, jointplot,  CLL pairs 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[125]:





<matplotlib.text.Text at 0x234e04828>



In [126]:

    
# Pairwise PDR difference against total_cpg_no_filter for the CLL pairs.
# The plot previously used `total_pairs` (CLL and normal together) although
# it is labelled "CLL pairs"; it now plots `cll_pairs`.
print("PDR difference vs. total # unique CpGs per cell, CLL pairs")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=cll_pairs, kind="reg")
plt.title("PDR_difference vs. total # Unique_CpGs per cell,  CLL pairs")









    



PDR difference vs. total # unique CpGs per cell, CLL pairs






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[126]:





<matplotlib.text.Text at 0x2032ea940>



In [127]:

    
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

# Fit a random forest on the current design matrix X and response y, then plot
# the impurity-based feature importances in ascending order.
model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
# Series.sort() is deprecated (see the FutureWarning this cell used to emit);
# sort_values() gives the same ascending order.
feature_importances = feature_importances.sort_values()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
# NOTE: this score is computed on the same X, y the model was fitted on.
print(str("Random Forest model score is ") + str(model.score(X,y)))









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting






    



Random Forest model score is 0.879297543226



In [ ]:



In [128]:

    
#
# normal
#
y = normal_pairs.PDR_difference # dependent variable
X = normal_pairs.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG", "type"], axis=1)

X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR")
print(X.shape)
est.summary()









    



Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR
(3045, 4)






    Out[128]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.193 


  Model:                    OLS          Adj. R-squared:        0.193 


  Method:              Least Squares     F-statistic:           243.2 


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  1.97e-141


  Time:                  22:07:04        Log-Likelihood:       5003.2 


  No. Observations:         3045         AIC:                  -9998. 


  Df Residuals:             3041         BIC:                  -9974. 


  Df Model:                    3                                      


  Covariance Type:       nonrobust                                    




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    1.6078      0.068     23.720   0.000      1.475     1.741


  total_cpg_no_filter   1.737e-08   4.76e-09      3.652   0.000   8.04e-09  2.67e-08


  bsRate_mean             -1.2732      0.050    -25.232   0.000     -1.372    -1.174


  avgReadCpG_mean         -0.0578      0.009     -6.280   0.000     -0.076    -0.040




  Omnibus:        228.977    Durbin-Watson:         1.579


  Prob(Omnibus):   0.000     Jarque-Bera (JB):    282.648


  Skew:            0.744     Prob(JB):           4.21e-62


  Kurtosis:        2.894     Cond. No.           4.12e+07



In [129]:

    
# Pairwise PDR difference against bsRate_mean for the normal pairs.
# The plot previously used `total_pairs` (CLL and normal together) although
# it is labelled "Normal pairs"; it now plots `normal_pairs`.
print("PDR_difference vs. bsRate, jointplot, Normal pairs ")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=normal_pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal pairs ")









    



PDR_difference vs. bsRate, jointplot, Normal pairs 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[129]:





<matplotlib.text.Text at 0x1233014a8>



In [130]:

    
# Pairwise PDR difference against avgReadCpG_mean for the normal pairs.
# The plot previously used `total_pairs` (CLL and normal together) although
# it is labelled "Normal pairs"; it now plots `normal_pairs`.
print("PDR_difference vs. mean avgReadCpG per cell, jointplot, Normal pairs ")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=normal_pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpG_mean, jointplot, Normal pairs ")









    



PDR_difference vs. mean avgReadCpG per cell, jointplot, Normal pairs 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[130]:





<matplotlib.text.Text at 0x1238d9048>



In [131]:

    
# Pairwise PDR difference against total_cpg_no_filter for the normal pairs.
# The plot previously used `total_pairs` (CLL and normal together) although
# it is labelled "Normal pairs"; it now plots `normal_pairs`.
print("PDR diference vs. total # unique CpGs per cell, Normal pairs")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=normal_pairs, kind="reg")
plt.title("PDR_difference vs. total # Unique_CpGs per cell, Normal pairs")









    



PDR diference vs. total # unique CpGs per cell, Normal pairs






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[131]:





<matplotlib.text.Text at 0x123be0208>



In [132]:

    
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

# Fit a random forest on the current design matrix X and response y, then plot
# the impurity-based feature importances in ascending order.
model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
# Series.sort() is deprecated (see the FutureWarning this cell used to emit);
# sort_values() gives the same ascending order.
feature_importances = feature_importances.sort_values()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
# NOTE: this score is computed on the same X, y the model was fitted on.
print(str("Random Forest model score is ") + str(model.score(X,y)))









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting






    



Random Forest model score is 0.883111512587



In [ ]:



In [ ]:



In [ ]:



In [ ]:

	filename	methylation	PDR_total	methylation_unweighted	PDR_unweighted	thisMeth	mixedReadCount	total_reads	type	bio	protocol	total_cpg_no_filter	total_cpg_gtrthan1	total_cpg_gtrthan38	bsRate	avgReadCpgs_nofilter	avgReadCpgs_lessthan1CpG	avgReadCpgs_gtreql3.8CpG
0	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CATGAC	0.529505	0.235795	0.632802	0.231878	2208325.0	983394.0	4170549.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	525282.0	525251.0	435636.0	0.9975	5.354284	5.355660	7.019255
1	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CCTTCG	0.455550	0.177631	0.583859	0.175371	733064.0	285841.0	1609185.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	221972.0	221962.0	186757.0	0.9975	5.587294	5.588449	7.302612
2	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CGGTAG	0.515269	0.177645	0.618578	0.174221	1452802.0	500870.0	2819500.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	355730.0	355713.0	295624.0	0.9975	5.393199	5.394331	7.079288
3	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTATTG	0.556175	0.176367	0.652727	0.172273	2279354.0	722800.0	4098270.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	483179.0	483150.0	397812.0	0.9975	5.287116	5.288477	6.979525
4	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTCAGC	0.528642	0.181331	0.640401	0.172287	1394208.0	478231.0	2637340.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	356122.0	356100.0	294065.0	0.9975	5.314302	5.315719	6.995052

Dep. Variable:	PDR_difference	R-squared:	0.016
Model:	OLS	Adj. R-squared:	0.001
Method:	Least Squares	F-statistic:	1.084
Date:	Tue, 09 Aug 2016	Prob (F-statistic):	0.357
Time:	22:02:23	Log-Likelihood:	826.59
No. Observations:	210	AIC:	-1645.
Df Residuals:	206	BIC:	-1632.
Df Model:	3
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
const	0.0685	4.560	0.015	0.988	-8.921 9.058
total_cpg_no_filter	-4.835e-09	5.1e-09	-0.949	0.344	-1.49e-08 5.21e-09
bsRate_mean	-0.0338	4.652	-0.007	0.994	-9.206 9.138
avgReadCpG_mean	-0.0047	0.012	-0.380	0.705	-0.029 0.020

Omnibus:	14.277	Durbin-Watson:	1.921
Prob(Omnibus):	0.001	Jarque-Bera (JB):	14.938
Skew:	0.618	Prob(JB):	0.000570
Kurtosis:	2.575	Cond. No.	1.46e+10

Omnibus:	12.553	Durbin-Watson:	2.493
Prob(Omnibus):	0.002	Jarque-Bera (JB):	10.855
Skew:	0.478	Prob(JB):	0.00439
Kurtosis:	2.428	Cond. No.	2.05e+10

Omnibus:	15.617	Durbin-Watson:	2.183
Prob(Omnibus):	0.000	Jarque-Bera (JB):	8.586
Skew:	0.346	Prob(JB):	0.0137
Kurtosis:	2.222	Cond. No.	3.35e+09

Omnibus:	9.974	Durbin-Watson:	1.844
Prob(Omnibus):	0.007	Jarque-Bera (JB):	8.038
Skew:	0.408	Prob(JB):	0.0180
Kurtosis:	2.410	Cond. No.	4.71e+09

Omnibus:	10.567	Durbin-Watson:	1.836
Prob(Omnibus):	0.005	Jarque-Bera (JB):	11.400
Skew:	0.595	Prob(JB):	0.00335
Kurtosis:	2.848	Cond. No.	6.97e+09

Omnibus:	8.744	Durbin-Watson:	1.824
Prob(Omnibus):	0.013	Jarque-Bera (JB):	9.347
Skew:	0.634	Prob(JB):	0.00934
Kurtosis:	2.799	Cond. No.	7.73e+09

Omnibus:	40.060	Durbin-Watson:	1.231
Prob(Omnibus):	0.000	Jarque-Bera (JB):	55.500
Skew:	1.096	Prob(JB):	8.88e-13
Kurtosis:	3.980	Cond. No.	1.03e+10

Omnibus:	1.751	Durbin-Watson:	1.629
Prob(Omnibus):	0.417	Jarque-Bera (JB):	1.636
Skew:	0.476	Prob(JB):	0.441
Kurtosis:	2.570	Cond. No.	3.28e+10

Omnibus:	12.423	Durbin-Watson:	1.565
Prob(Omnibus):	0.002	Jarque-Bera (JB):	13.696
Skew:	0.650	Prob(JB):	0.00106
Kurtosis:	2.801	Cond. No.	8.67e+09

Omnibus:	51.119	Durbin-Watson:	1.597
Prob(Omnibus):	0.000	Jarque-Bera (JB):	95.901
Skew:	1.592	Prob(JB):	1.50e-21
Kurtosis:	5.216	Cond. No.	1.42e+10

Omnibus:	71.046	Durbin-Watson:	0.233
Prob(Omnibus):	0.000	Jarque-Bera (JB):	175.322
Skew:	2.060	Prob(JB):	8.50e-39
Kurtosis:	6.244	Cond. No.	1.55e+10

Omnibus:	2.812	Durbin-Watson:	1.787
Prob(Omnibus):	0.245	Jarque-Bera (JB):	2.853
Skew:	0.332	Prob(JB):	0.240
Kurtosis:	2.748	Cond. No.	1.27e+10

Omnibus:	63.195	Durbin-Watson:	0.326
Prob(Omnibus):	0.000	Jarque-Bera (JB):	121.871
Skew:	1.505	Prob(JB):	3.44e-27
Kurtosis:	5.207	Cond. No.	3.04e+10

Omnibus:	100.022	Durbin-Watson:	1.953
Prob(Omnibus):	0.000	Jarque-Bera (JB):	15.494
Skew:	0.291	Prob(JB):	0.000432
Kurtosis:	1.803	Cond. No.	1.48e+10

Omnibus:	16.410	Durbin-Watson:	1.075
Prob(Omnibus):	0.000	Jarque-Bera (JB):	18.736
Skew:	0.769	Prob(JB):	8.54e-05
Kurtosis:	3.025	Cond. No.	1.63e+10

Omnibus:	16.001	Durbin-Watson:	1.644
Prob(Omnibus):	0.000	Jarque-Bera (JB):	17.820
Skew:	0.665	Prob(JB):	0.000135
Kurtosis:	2.711	Cond. No.	2.98e+10

Omnibus:	11.622	Durbin-Watson:	2.479
Prob(Omnibus):	0.003	Jarque-Bera (JB):	7.442
Skew:	0.338	Prob(JB):	0.0242
Kurtosis:	2.306	Cond. No.	1.62e+10

Omnibus:	16.967	Durbin-Watson:	1.918
Prob(Omnibus):	0.000	Jarque-Bera (JB):	5.746
Skew:	0.074	Prob(JB):	0.0565
Kurtosis:	2.114	Cond. No.	9.64e+08

Omnibus:	381.466	Durbin-Watson:	1.986
Prob(Omnibus):	0.000	Jarque-Bera (JB):	15.970
Skew:	0.127	Prob(JB):	0.000341
Kurtosis:	1.603	Cond. No.	4.74e+09

Omnibus:	65.201	Durbin-Watson:	1.106
Prob(Omnibus):	0.000	Jarque-Bera (JB):	9.989
Skew:	0.068	Prob(JB):	0.00677
Kurtosis:	1.824	Cond. No.	7.61e+09

Omnibus:	371.562	Durbin-Watson:	2.061
Prob(Omnibus):	0.000	Jarque-Bera (JB):	15.713
Skew:	0.223	Prob(JB):	0.000387
Kurtosis:	1.584	Cond. No.	5.21e+09

Omnibus:	35.431	Durbin-Watson:	1.950
Prob(Omnibus):	0.000	Jarque-Bera (JB):	10.460
Skew:	0.317	Prob(JB):	0.00535
Kurtosis:	1.967	Cond. No.	4.87e+09

Omnibus:	13.148	Durbin-Watson:	1.372
Prob(Omnibus):	0.001	Jarque-Bera (JB):	9.844
Skew:	0.634	Prob(JB):	0.00728
Kurtosis:	2.198	Cond. No.	5.84e+09

Omnibus:	441.585	Durbin-Watson:	1.572
Prob(Omnibus):	0.000	Jarque-Bera (JB):	605.853
Skew:	0.874	Prob(JB):	2.76e-132
Kurtosis:	3.740	Cond. No.	4.97e+07

	const	total_cpg_no_filter	bsRate_mean	avgReadCpG_mean	type_CLL
0	1	709541.5	0.980181	5.354731	1.0
1	1	786284.0	0.980293	5.367319	1.0
2	1	683623.5	0.980268	5.369633	1.0
3	1	853200.5	0.980262	5.367604	1.0
4	1	842686.5	0.980228	5.374553	1.0

Omnibus:	90.903	Durbin-Watson:	1.967
Prob(Omnibus):	0.000	Jarque-Bera (JB):	114.285
Skew:	0.807	Prob(JB):	1.52e-25
Kurtosis:	3.404	Cond. No.	1.63e+08

Omnibus:	228.977	Durbin-Watson:	1.579
Prob(Omnibus):	0.000	Jarque-Bera (JB):	282.648
Skew:	0.744	Prob(JB):	4.21e-62
Kurtosis:	2.894	Cond. No.	4.12e+07