notebook.community

Edit and run



In [1]:

    
%matplotlib inline



In [2]:

    
#
# Covariates are: 
#   - Number of unique CpGs per cell
#   - Median Average Read CpG per cell (or mean if normally distributed)
#   - BS rate per cell
#   - CLL or Normal status per cell
#
# For distances (i.e. PDR difference between pairs, methylation difference between pairs), 
# the covariates are the mean between the two pairs. 
#



In [3]:

    
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # print all rows


import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')

import statsmodels.api as sm



In [4]:

    
stats = pd.read_csv("all_RRBS_statistics_final.csv")



In [5]:

    
stats.shape









    Out[5]:





(438, 15)



In [6]:

    
normal = stats[stats["type"]=="normal"]
CLL = stats[stats["type"]=="CLL"]



In [7]:

    
len(normal)









    Out[7]:





336



In [8]:

    
len(CLL)









    Out[8]:





102



In [9]:

    
mcell_cpg = pd.read_csv("mcell_avgCpGs.csv")
pcell_cpg = pd.read_csv("pcell_avgCpGs.csv")
CD19cell_cpg = pd.read_csv("CD19_avgCpGs.csv")
normalB_cell_cpg = pd.read_csv("normalB_avgCpGs.csv")
trito_cell_cpg = pd.read_csv("trito_avgCpGs.csv")
cw154_cell_cpg = pd.read_csv("cw154_cpgs.csv")



In [10]:

    
mcell_cpg = mcell_cpg.drop(["Unnamed: 0"], axis=1)
pcell_cpg = pcell_cpg.drop(["Unnamed: 0"], axis=1)
CD19cell_cpg = CD19cell_cpg.drop(["Unnamed: 0"], axis=1)
normalB_cell_cpg = normalB_cell_cpg.drop(["Unnamed: 0"], axis=1)
trito_cell_cpg = trito_cell_cpg.drop(["Unnamed: 0"], axis=1)
cw154_cell_cpg = cw154_cell_cpg.drop(["Unnamed: 0"], axis=1)



In [11]:

    
cpg_total = pd.concat([mcell_cpg, pcell_cpg, CD19cell_cpg, normalB_cell_cpg, trito_cell_cpg, cw154_cell_cpg])



In [12]:

    
cpg_total.shape









    Out[12]:





(513, 4)



In [13]:

    
merged = stats.merge(cpg_total, on="filename")



In [14]:

    
merged.shape









    Out[14]:





(438, 18)



In [15]:

    
merged.head()









    Out[15]:






  
    
      
      filename
      methylation
      PDR_total
      methylation_unweighted
      PDR_unweighted
      thisMeth
      mixedReadCount
      total_reads
      type
      bio
      protocol
      total_cpg_no_filter
      total_cpg_gtrthan1
      total_cpg_gtrthan38
      bsRate
      avgReadCpgs_nofilter
      avgReadCpgs_lessthan1CpG
      avgReadCpgs_gtreql3.8CpG
    
  
  
    
      0
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CATGAC
      0.529505
      0.235795
      0.632802
      0.231878
      2208325.0
      983394.0
      4170549.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      525282.0
      525251.0
      435636.0
      0.9975
      5.354284
      5.355660
      7.019255
    
    
      1
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CCTTCG
      0.455550
      0.177631
      0.583859
      0.175371
      733064.0
      285841.0
      1609185.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      221972.0
      221962.0
      186757.0
      0.9975
      5.587294
      5.588449
      7.302612
    
    
      2
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CGGTAG
      0.515269
      0.177645
      0.618578
      0.174221
      1452802.0
      500870.0
      2819500.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      355730.0
      355713.0
      295624.0
      0.9975
      5.393199
      5.394331
      7.079288
    
    
      3
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTATTG
      0.556175
      0.176367
      0.652727
      0.172273
      2279354.0
      722800.0
      4098270.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      483179.0
      483150.0
      397812.0
      0.9975
      5.287116
      5.288477
      6.979525
    
    
      4
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTCAGC
      0.528642
      0.181331
      0.640401
      0.172287
      1394208.0
      478231.0
      2637340.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      356122.0
      356100.0
      294065.0
      0.9975
      5.314302
      5.315719
      6.995052



In [ ]:



In [ ]:



In [16]:

    
tritopool = merged[merged["protocol"] == 'trito_pool_1']       # select only "trito_pool_1" files
print(len(tritopool))
tritopool = tritopool.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
tritopoolA = tritopool.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool.filename,2)) # combines into all pairs
out = pd.DataFrame([tritopoolA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.PDR_unweighted, tritopool.PDR_unweighted)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1 = pd.merge(out, PDR_differences, how='inner')
print(pairs1.shape)

pairs1 = pairs1.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs1.PDR_difference # dependent variable to predict

X = pairs1.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



21
(210, 14)
(210,)
(210, 3)
Regression results for CLL 'RRBS_trito_pool1', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[16]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.047


  Model:                    OLS          Adj. R-squared:        0.033


  Method:              Least Squares     F-statistic:           3.374


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):   0.0194 


  Time:                  23:49:50        Log-Likelihood:       884.39


  No. Observations:          210         AIC:                  -1761.


  Df Residuals:              206         BIC:                  -1747.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    8.7620      3.468      2.526   0.012      1.925    15.599


  total_cpg_no_filter    6.31e-09   3.43e-09      1.842   0.067  -4.45e-10  1.31e-08


  bsRate_mean             -9.0390      3.536     -2.556   0.011    -16.011    -2.067


  avgReadCpG_mean          0.0140      0.009      1.600   0.111     -0.003     0.031




  Omnibus:        11.066    Durbin-Watson:         2.056


  Prob(Omnibus):   0.004    Jarque-Bera (JB):     11.795


  Skew:            0.562    Prob(JB):            0.00275


  Kurtosis:        2.707    Cond. No.           1.46e+10



In [17]:

    
print("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs1, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")









    



PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[17]:





<matplotlib.text.Text at 0x10bc6e940>



In [18]:

    
print("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs1, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")









    



PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[18]:





<matplotlib.text.Text at 0x10c81d710>



In [19]:

    
print("PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs1, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")









    



PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_1






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[19]:





<matplotlib.text.Text at 0x10ca78908>



In [20]:

    
tritopool = merged[merged["protocol"] == 'trito_pool_2']       # select only "trito_pool_1" files
print(len(tritopool))
tritopool = tritopool.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
tritopoolA = tritopool.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool.filename,2)) # combines into all pairs
out = pd.DataFrame([tritopoolA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.PDR_unweighted, tritopool.PDR_unweighted)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1a = pd.merge(out, PDR_differences, how='inner')
print(pairs1a.shape)
pairs1a = pairs1a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs1a.PDR_difference # dependent variable to predict

X = pairs1a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



21
(210, 14)
(210,)
(210, 3)
Regression results for CLL 'RRBS_trito_pool1', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[20]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.050


  Model:                    OLS          Adj. R-squared:        0.036


  Method:              Least Squares     F-statistic:           3.605


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):   0.0143 


  Time:                  23:49:53        Log-Likelihood:       862.29


  No. Observations:          210         AIC:                  -1717.


  Df Residuals:              206         BIC:                  -1703.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    5.4407      4.471      1.217   0.225     -3.373    14.255


  total_cpg_no_filter     9.8e-09   3.56e-09      2.753   0.006   2.78e-09  1.68e-08


  bsRate_mean             -5.4892      4.636     -1.184   0.238    -14.630     3.652


  avgReadCpG_mean         -0.0120      0.011     -1.069   0.286     -0.034     0.010




  Omnibus:        13.995    Durbin-Watson:         2.461


  Prob(Omnibus):   0.001    Jarque-Bera (JB):      9.802


  Skew:            0.406    Prob(JB):            0.00744


  Kurtosis:        2.322    Cond. No.           1.68e+10



In [21]:

    
print("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs1a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")









    



PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[21]:





<matplotlib.text.Text at 0x10cdc5358>



In [22]:

    
print("PDR_difference vs. mean avgReadCpGs, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs1a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")









    



PDR_difference vs. mean avgReadCpGs, jointplot, RRBS_trito_pool CLL trito_2






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[22]:





<matplotlib.text.Text at 0x10cff3c88>



In [23]:

    
print("PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs1a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")









    



PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_2






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[23]:





<matplotlib.text.Text at 0x10d3014a8>



In [ ]:



In [24]:

    
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.PDR_unweighted, cw154.PDR_unweighted)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2 = pd.merge(out, PDR_differences, how='inner')
print(pairs2.shape)
pairs2 = pairs2.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs2.PDR_difference # dependent variable to predict

X = pairs2.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_Tris_protease', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_Tris_protease', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[24]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.029


  Model:                    OLS          Adj. R-squared:        0.014


  Method:              Least Squares     F-statistic:           1.879


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):    0.135 


  Time:                  23:49:56        Log-Likelihood:       782.07


  No. Observations:          190         AIC:                  -1556.


  Df Residuals:              186         BIC:                  -1543.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   -3.3290      1.748     -1.904   0.058     -6.778     0.120


  total_cpg_no_filter  -1.609e-09   2.96e-09     -0.543   0.588  -7.45e-09  4.24e-09


  bsRate_mean              3.5459      1.830      1.938   0.054     -0.064     7.156


  avgReadCpG_mean         -0.0102      0.006     -1.831   0.069     -0.021     0.001




  Omnibus:        13.706    Durbin-Watson:         2.228


  Prob(Omnibus):   0.001    Jarque-Bera (JB):     10.458


  Skew:            0.469    Prob(JB):            0.00536


  Kurtosis:        2.335    Cond. No.           3.27e+09



In [25]:

    
print("PDR_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs2, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, cw154  CLL cw154_Tris_protease")









    



PDR_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[25]:





<matplotlib.text.Text at 0x10d06d198>



In [26]:

    
print("PDR_difference vs. avgReadCpGs, jointplot, CLL cw154_Tris_protease")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs2, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_Tris_protease")









    



PDR_difference vs. avgReadCpGs, jointplot, CLL cw154_Tris_protease






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[26]:





<matplotlib.text.Text at 0x10d81b208>



In [27]:

    
print("PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs2, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_Tris_protease")









    



PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[27]:





<matplotlib.text.Text at 0x10da56048>



In [28]:

    
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease_GR']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.PDR_unweighted, cw154.PDR_unweighted)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2a = pd.merge(out, PDR_differences, how='inner')
print(pairs2a.shape)
pairs2a = pairs2a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs2a.PDR_difference # dependent variable to predict

X = pairs2a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_Tris_protease_GR', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_Tris_protease_GR', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[28]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.131


  Model:                    OLS          Adj. R-squared:        0.117


  Method:              Least Squares     F-statistic:           9.323


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  8.98e-06


  Time:                  23:50:00        Log-Likelihood:       754.47


  No. Observations:          190         AIC:                  -1501.


  Df Residuals:              186         BIC:                  -1488.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   -5.8954      2.242     -2.630   0.009    -10.318    -1.472


  total_cpg_no_filter  -5.239e-09   2.77e-09     -1.891   0.060  -1.07e-08  2.26e-10


  bsRate_mean              6.0663      2.352      2.579   0.011      1.425    10.707


  avgReadCpG_mean          0.0105      0.005      2.292   0.023      0.001     0.020




  Omnibus:        16.408    Durbin-Watson:         1.723


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     18.113


  Skew:            0.740    Prob(JB):           0.000117


  Kurtosis:        3.314    Cond. No.           4.47e+09



In [29]:

    
print("PDR_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease_GR")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs2a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, cw154  CLL cw154_Tris_protease_GR")









    



PDR_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease_GR






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[29]:





<matplotlib.text.Text at 0x10dcd8ef0>



In [30]:

    
print("PDR_difference vs. avgReadCpgs, jointplot, CLL cw154_Tris_protease_GR")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs2a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_Tris_protease_GR")









    



PDR_difference vs. avgReadCpgs, jointplot, CLL cw154_Tris_protease_GR






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[30]:





<matplotlib.text.Text at 0x10dedb9e8>



In [31]:

    
print("PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease_GR")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs2a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_Tris_protease_GR")









    



PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease_GR






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[31]:





<matplotlib.text.Text at 0x10e120198>



In [ ]:



In [ ]:



In [32]:

    
cw154 = merged[merged["protocol"] == 'cw154_CutSmart_proteinase_K']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.PDR_unweighted, cw154.PDR_unweighted)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2b = pd.merge(out, PDR_differences, how='inner')
print(pairs2b.shape)
pairs2b = pairs2b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs2b.PDR_difference # dependent variable to predict

X = pairs2b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[32]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.149


  Model:                    OLS          Adj. R-squared:        0.136


  Method:              Least Squares     F-statistic:           10.89


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  1.26e-06


  Time:                  23:50:03        Log-Likelihood:       723.17


  No. Observations:          190         AIC:                  -1438.


  Df Residuals:              186         BIC:                  -1425.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -10.5371      3.337     -3.158   0.002    -17.120    -3.954


  total_cpg_no_filter  -1.983e-08   4.15e-09     -4.782   0.000   -2.8e-08 -1.16e-08


  bsRate_mean             10.7370      3.460      3.104   0.002      3.912    17.562


  avgReadCpG_mean          0.0291      0.011      2.584   0.011      0.007     0.051




  Omnibus:        11.003    Durbin-Watson:         1.935


  Prob(Omnibus):   0.004    Jarque-Bera (JB):     11.304


  Skew:            0.584    Prob(JB):            0.00351


  Kurtosis:        3.252    Cond. No.           6.96e+09



In [33]:

    
print("PDR_difference vs. bsRate, jointplot,  CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs2b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")









    



PDR_difference vs. bsRate, jointplot,  CLL cw154_CutSmart_proteinase_K






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[33]:





<matplotlib.text.Text at 0x10e489c18>



In [34]:

    
print("PDR_difference vs. avgReadCpG_mean, jointplot, CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs2b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_CutSmart_proteinase_K")









    



PDR_difference vs. avgReadCpG_mean, jointplot, CLL cw154_CutSmart_proteinase_K






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[34]:





<matplotlib.text.Text at 0x10e699dd8>



In [35]:

    
print("PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs2b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")









    



PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_CutSmart_proteinase_K






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[35]:





<matplotlib.text.Text at 0x10e8c8780>



In [ ]:



In [36]:

    
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell1_22_']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR_unweighted, pcell.PDR_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3 = pd.merge(out, PDR_differences, how='inner')
print(pairs3.shape)
pairs3 = pairs3.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs3.PDR_difference # dependent variable to predict

X = pairs3.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



17
(136, 14)
(136,)
(136, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[36]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.012


  Model:                    OLS          Adj. R-squared:       -0.010


  Method:              Least Squares     F-statistic:          0.5533


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):    0.647 


  Time:                  23:50:06        Log-Likelihood:       240.20


  No. Observations:          136         AIC:                  -472.4


  Df Residuals:              132         BIC:                  -460.7


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   -9.8182     61.115     -0.161   0.873   -130.709   111.073


  total_cpg_no_filter  -7.492e-08   7.65e-08     -0.979   0.329  -2.26e-07  7.64e-08


  bsRate_mean             10.6164     61.269      0.173   0.863   -110.580   131.812


  avgReadCpG_mean         -0.0992      0.079     -1.249   0.214     -0.256     0.058




  Omnibus:        10.120    Durbin-Watson:         1.840


  Prob(Omnibus):   0.006    Jarque-Bera (JB):     11.038


  Skew:            0.688    Prob(JB):            0.00401


  Kurtosis:        2.769    Cond. No.           7.56e+09



In [37]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs3, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[37]:





<matplotlib.text.Text at 0x10eb25048>



In [38]:

    
print("PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs3, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell1_22_")









    



PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[38]:





<matplotlib.text.Text at 0x10ed857b8>



In [39]:

    
print("PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs3, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")









    



PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[39]:





<matplotlib.text.Text at 0x10f085f28>



In [ ]:



In [40]:

    
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell23_44']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR_unweighted, pcell.PDR_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3a = pd.merge(out, PDR_differences, how='inner')
print(pairs3a.shape)
pairs3a = pairs3a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs3a.PDR_difference # dependent variable to predict

X = pairs3a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



22
(231, 14)
(231,)
(231, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[40]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.078


  Model:                    OLS          Adj. R-squared:        0.066


  Method:              Least Squares     F-statistic:           6.400


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  0.000351


  Time:                  23:50:10        Log-Likelihood:       517.32


  No. Observations:          231         AIC:                  -1027.


  Df Residuals:              227         BIC:                  -1013.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -58.7101     43.375     -1.354   0.177   -144.179    26.759


  total_cpg_no_filter    7.09e-08   3.33e-08      2.130   0.034    5.3e-09  1.36e-07


  bsRate_mean             58.2610     43.449      1.341   0.181    -27.355   143.877


  avgReadCpG_mean          0.0857      0.020      4.249   0.000      0.046     0.125




  Omnibus:        41.518    Durbin-Watson:         1.321


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     58.549


  Skew:            1.115    Prob(JB):           1.93e-13


  Kurtosis:        4.054    Cond. No.           1.03e+10



In [41]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs3a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[41]:





<matplotlib.text.Text at 0x10f3bff98>



In [42]:

    
print("PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs3a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell23_44")









    



PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[42]:





<matplotlib.text.Text at 0x10f61b128>



In [43]:

    
print("PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs3a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")









    



PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[43]:





<matplotlib.text.Text at 0x10f918240>



In [ ]:



In [44]:

    
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell45_66']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR_unweighted, pcell.PDR_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3b = pd.merge(out, PDR_differences, how='inner')
print(pairs3b.shape)
pairs3b = pairs3b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs3b.PDR_difference # dependent variable to predict

X = pairs3b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



9
(36, 14)
(36,)
(36, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[44]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.200


  Model:                    OLS          Adj. R-squared:        0.125


  Method:              Least Squares     F-statistic:           2.660


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):   0.0648 


  Time:                  23:50:13        Log-Likelihood:       76.330


  No. Observations:           36         AIC:                  -144.7


  Df Residuals:               32         BIC:                  -138.3


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -267.9734    252.270     -1.062   0.296   -781.831   245.884


  total_cpg_no_filter  -2.207e-08    3.6e-08     -0.614   0.544  -9.54e-08  5.12e-08


  bsRate_mean            267.6661    252.429      1.060   0.297   -246.514   781.847


  avgReadCpG_mean          0.1455      0.081      1.799   0.081     -0.019     0.310




  Omnibus:         1.832    Durbin-Watson:         1.728


  Prob(Omnibus):   0.400    Jarque-Bera (JB):      1.706


  Skew:            0.450    Prob(JB):              0.426


  Kurtosis:        2.428    Cond. No.           3.29e+10



In [45]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs3b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[45]:





<matplotlib.text.Text at 0x10fb77a90>



In [46]:

    
print("PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs3b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell45_66")









    



PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[46]:





<matplotlib.text.Text at 0x10fd9fe48>



In [47]:

    
print("PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs3b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")









    



PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[47]:





<matplotlib.text.Text at 0x10febed30>



In [ ]:



In [ ]:



In [48]:

    
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell67_88']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR_unweighted, pcell.PDR_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3c = pd.merge(out, PDR_differences, how='inner')
print(pairs3c.shape)
pairs3c = pairs3c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs3c.PDR_difference # dependent variable to predict

X = pairs3c.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[48]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.005


  Model:                    OLS          Adj. R-squared:       -0.011


  Method:              Least Squares     F-statistic:          0.3295


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):    0.804 


  Time:                  23:50:16        Log-Likelihood:       400.88


  No. Observations:          190         AIC:                  -793.8


  Df Residuals:              186         BIC:                  -780.8


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -21.1774     45.736     -0.463   0.644   -111.406    69.051


  total_cpg_no_filter    3.07e-08   3.76e-08      0.816   0.415  -4.35e-08  1.05e-07


  bsRate_mean             21.1448     45.725      0.462   0.644    -69.062   111.352


  avgReadCpG_mean          0.0183      0.049      0.373   0.709     -0.078     0.115




  Omnibus:        11.949    Durbin-Watson:         1.590


  Prob(Omnibus):   0.003    Jarque-Bera (JB):     13.024


  Skew:            0.627    Prob(JB):            0.00149


  Kurtosis:        2.733    Cond. No.           8.48e+09



In [49]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs3c, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[49]:





<matplotlib.text.Text at 0x1102ab3c8>



In [50]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs3c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell67_88")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27pcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[50]:





<matplotlib.text.Text at 0x1104eccc0>



In [51]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs3c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27pcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[51]:





<matplotlib.text.Text at 0x110822588>



In [ ]:



In [ ]:



In [52]:

    
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell1_22_']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR_unweighted, mcell.PDR_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4 = pd.merge(out, PDR_differences, how='inner')
print(pairs4.shape)
pairs4 = pairs4.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs4.PDR_difference # dependent variable to predict

X = pairs4.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



18
(153, 14)
(153,)
(153, 3)
Regression results for Normal 'NormalBCD19pCD27mcell1_22_', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[52]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.195


  Model:                    OLS          Adj. R-squared:        0.179


  Method:              Least Squares     F-statistic:           12.01


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  4.36e-07


  Time:                  23:50:20        Log-Likelihood:       418.20


  No. Observations:          153         AIC:                  -828.4


  Df Residuals:              149         BIC:                  -816.3


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  234.9377     42.901      5.476   0.000    150.165   319.710


  total_cpg_no_filter  -5.267e-08   2.36e-08     -2.228   0.027  -9.94e-08 -5.96e-09


  bsRate_mean           -235.9340     43.113     -5.472   0.000   -321.126  -150.742


  avgReadCpG_mean          0.0550      0.025      2.213   0.028      0.006     0.104




  Omnibus:        50.644    Durbin-Watson:         1.624


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     92.356


  Skew:            1.609    Prob(JB):           8.81e-21


  Kurtosis:        5.034    Cond. No.           1.52e+10



In [53]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs4, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[53]:





<matplotlib.text.Text at 0x110a723c8>



In [54]:

    
print("PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs4, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell1_22_")









    



PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[54]:





<matplotlib.text.Text at 0x110d3c8d0>



In [55]:

    
print("PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs4, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")









    



PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[55]:





<matplotlib.text.Text at 0x111066400>



In [ ]:



In [56]:

    
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell23_44']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR_unweighted, mcell.PDR_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4a = pd.merge(out, PDR_differences, how='inner')
print(pairs4a.shape)
pairs4a = pairs4a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs4a.PDR_difference # dependent variable to predict

X = pairs4a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell23_44', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



18
(153, 14)
(153,)
(153, 3)
Regression results for Normal 'NormalBCD19pCD27mcell23_44', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[56]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.108


  Model:                    OLS          Adj. R-squared:        0.090


  Method:              Least Squares     F-statistic:           5.987


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  0.000702


  Time:                  23:50:24        Log-Likelihood:       365.61


  No. Observations:          153         AIC:                  -723.2


  Df Residuals:              149         BIC:                  -711.1


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -24.3938     68.215     -0.358   0.721   -159.187   110.400


  total_cpg_no_filter    1.36e-07   4.57e-08      2.978   0.003   4.58e-08  2.26e-07


  bsRate_mean             24.5140     68.405      0.358   0.721   -110.655   159.683


  avgReadCpG_mean         -0.0126      0.046     -0.272   0.786     -0.104     0.079




  Omnibus:        71.909    Durbin-Watson:         0.216


  Prob(Omnibus):   0.000    Jarque-Bera (JB):    179.188


  Skew:            2.083    Prob(JB):           1.23e-39


  Kurtosis:        6.279    Cond. No.           1.55e+10



In [57]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs4a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[57]:





<matplotlib.text.Text at 0x1113257b8>



In [58]:

    
print("PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs4a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell23_44")









    



PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[58]:





<matplotlib.text.Text at 0x1115f3d30>



In [59]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs4a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[59]:





<matplotlib.text.Text at 0x1118aae48>



In [ ]:



In [60]:

    
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell45_66']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR_unweighted, mcell.PDR_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4b = pd.merge(out, PDR_differences, how='inner')
print(pairs4b.shape)
pairs4b = pairs4b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs4b.PDR_difference # dependent variable to predict

X = pairs4b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell45_66', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



17
(136, 14)
(136,)
(136, 3)
Regression results for Normal 'NormalBCD19pCD27mcell45_66', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[60]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.150


  Model:                    OLS          Adj. R-squared:        0.131


  Method:              Least Squares     F-statistic:           7.772


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  8.09e-05


  Time:                  23:50:28        Log-Likelihood:       606.00


  No. Observations:          136         AIC:                  -1204.


  Df Residuals:              132         BIC:                  -1192.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    2.0897      7.525      0.278   0.782    -12.796    16.975


  total_cpg_no_filter   2.636e-08   6.51e-09      4.049   0.000   1.35e-08  3.92e-08


  bsRate_mean             -2.2537      7.537     -0.299   0.765    -17.162    12.655


  avgReadCpG_mean          0.0222      0.005      4.544   0.000      0.013     0.032




  Omnibus:        22.529    Durbin-Watson:         1.289


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     28.019


  Skew:            0.995    Prob(JB):           8.24e-07


  Kurtosis:        3.991    Cond. No.           1.28e+10



In [61]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs4b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[61]:





<matplotlib.text.Text at 0x111c672b0>



In [62]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs4b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell45_66")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[62]:





<matplotlib.text.Text at 0x111eac4e0>



In [63]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs4b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[63]:





<matplotlib.text.Text at 0x1120c2630>



In [ ]:



In [64]:

    
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell67_88']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR_unweighted, mcell.PDR_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4c = pd.merge(out, PDR_differences, how='inner')
print(pairs4c.shape)
pairs4c = pairs4c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs4c.PDR_difference # dependent variable to predict

X = pairs4c.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell67_88', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



21
(210, 14)
(210,)
(210, 3)
Regression results for Normal 'NormalBCD19pCD27mcell67_88', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[64]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.238


  Model:                    OLS          Adj. R-squared:        0.227


  Method:              Least Squares     F-statistic:           21.47


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  3.84e-12


  Time:                  23:50:32        Log-Likelihood:       587.23


  No. Observations:          210         AIC:                  -1166.


  Df Residuals:              206         BIC:                  -1153.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -342.8072     65.916     -5.201   0.000   -472.765  -212.850


  total_cpg_no_filter   1.922e-07   2.44e-08      7.874   0.000   1.44e-07   2.4e-07


  bsRate_mean            342.8458     66.019      5.193   0.000    212.686   473.006


  avgReadCpG_mean          0.1074      0.020      5.455   0.000      0.069     0.146




  Omnibus:        65.926    Durbin-Watson:         0.267


  Prob(Omnibus):   0.000    Jarque-Bera (JB):    130.262


  Skew:            1.558    Prob(JB):           5.17e-29


  Kurtosis:        5.275    Cond. No.           3.13e+10



In [65]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs4c, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[65]:





<matplotlib.text.Text at 0x1124cbf60>



In [66]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs4c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[66]:





<matplotlib.text.Text at 0x112816898>



In [67]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs4c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[67]:





<matplotlib.text.Text at 0x112ac09e8>



In [ ]:



In [68]:

    
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell1_22_']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.PDR_unweighted, CD19cell.PDR_unweighted)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5 = pd.merge(out, PDR_differences, how='inner')
print(pairs5.shape)
pairs5 = pairs5.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs5.PDR_difference # dependent variable to predict

X = pairs5.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



21
(210, 14)
(210,)
(210, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell1_22_', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[68]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.098


  Model:                    OLS          Adj. R-squared:        0.085


  Method:              Least Squares     F-statistic:           7.481


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  8.89e-05


  Time:                  23:50:36        Log-Likelihood:       322.46


  No. Observations:          210         AIC:                  -636.9


  Df Residuals:              206         BIC:                  -623.5


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -334.5515     86.457     -3.870   0.000   -505.005  -164.098


  total_cpg_no_filter  -2.395e-08   2.16e-08     -1.109   0.269  -6.65e-08  1.86e-08


  bsRate_mean            336.0882     86.687      3.877   0.000    165.182   506.995


  avgReadCpG_mean         -0.0987      0.059     -1.684   0.094     -0.214     0.017




  Omnibus:        87.321    Durbin-Watson:         1.967


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     15.226


  Skew:            0.304    Prob(JB):           0.000494


  Kurtosis:        1.829    Cond. No.           1.48e+10



In [69]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs5, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")









    



PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[69]:





<matplotlib.text.Text at 0x112db8f60>



In [70]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs5, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell1_22_")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[70]:





<matplotlib.text.Text at 0x11300deb8>



In [71]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs5, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[71]:





<matplotlib.text.Text at 0x113314940>



In [ ]:



In [72]:

    
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell23_44']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.PDR_unweighted, CD19cell.PDR_unweighted)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5a = pd.merge(out, PDR_differences, how='inner')
print(pairs5a.shape)
pairs5a = pairs5a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs5a.PDR_difference # dependent variable to predict

X = pairs5a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell23_44', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell23_44', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[72]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.165


  Model:                    OLS          Adj. R-squared:        0.151


  Method:              Least Squares     F-statistic:           12.25


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  2.39e-07


  Time:                  23:50:40        Log-Likelihood:       346.02


  No. Observations:          190         AIC:                  -684.0


  Df Residuals:              186         BIC:                  -671.0


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -242.7377     44.153     -5.498   0.000   -329.842  -155.633


  total_cpg_no_filter   7.437e-09   1.29e-08      0.575   0.566  -1.81e-08  3.29e-08


  bsRate_mean            244.0889     44.331      5.506   0.000    156.633   331.545


  avgReadCpG_mean         -0.1076      0.048     -2.252   0.025     -0.202    -0.013




  Omnibus:        13.462    Durbin-Watson:         1.115


  Prob(Omnibus):   0.001    Jarque-Bera (JB):     14.990


  Skew:            0.686    Prob(JB):           0.000556


  Kurtosis:        2.900    Cond. No.           1.63e+10



In [73]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs5a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")









    



PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[73]:





<matplotlib.text.Text at 0x113612588>



In [74]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs5a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell23_44")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[74]:





<matplotlib.text.Text at 0x113866518>



In [75]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs5a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[75]:





<matplotlib.text.Text at 0x113b71208>



In [ ]:



In [76]:

    
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell45_66']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.PDR_unweighted, CD19cell.PDR_unweighted)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5b = pd.merge(out, PDR_differences, how='inner')
print(pairs5b.shape)
pairs5b = pairs5b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs5b.PDR_difference # dependent variable to predict

X = pairs5b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell45_66', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



22
(231, 14)
(231,)
(231, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell45_66', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[76]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.253


  Model:                    OLS          Adj. R-squared:        0.243


  Method:              Least Squares     F-statistic:           25.64


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  2.54e-14


  Time:                  23:50:45        Log-Likelihood:       403.11


  No. Observations:          231         AIC:                  -798.2


  Df Residuals:              227         BIC:                  -784.4


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -803.3758    111.524     -7.204   0.000  -1023.131  -583.621


  total_cpg_no_filter   3.696e-08   1.68e-08      2.203   0.029    3.9e-09     7e-08


  bsRate_mean            806.1783    111.720      7.216   0.000    586.037  1026.320


  avgReadCpG_mean         -0.2346      0.064     -3.668   0.000     -0.361    -0.109




  Omnibus:        12.498    Durbin-Watson:         1.764


  Prob(Omnibus):   0.002    Jarque-Bera (JB):     11.666


  Skew:            0.490    Prob(JB):            0.00293


  Kurtosis:        2.499    Cond. No.           2.98e+10



In [77]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs5b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")









    



PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[77]:





<matplotlib.text.Text at 0x113e362b0>



In [78]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs5b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell45_66")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[78]:





<matplotlib.text.Text at 0x114057898>



In [79]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs5b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[79]:





<matplotlib.text.Text at 0x11442d940>



In [ ]:



In [80]:

    
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell67_88']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.PDR_unweighted, CD19cell.PDR_unweighted)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5c = pd.merge(out, PDR_differences, how='inner')
print(pairs5c.shape)
pairs5c = pairs5c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs5c.PDR_difference # dependent variable to predict

X = pairs5c.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell67_88', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell67_88', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[80]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.055


  Model:                    OLS          Adj. R-squared:        0.039


  Method:              Least Squares     F-statistic:           3.580


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):   0.0150 


  Time:                  23:50:51        Log-Likelihood:       315.45


  No. Observations:          190         AIC:                  -622.9


  Df Residuals:              186         BIC:                  -609.9


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -148.8477     94.399     -1.577   0.117   -335.077    37.382


  total_cpg_no_filter  -1.952e-08   1.81e-08     -1.076   0.283  -5.53e-08  1.63e-08


  bsRate_mean            149.7852     94.629      1.583   0.115    -36.899   336.469


  avgReadCpG_mean         -0.1082      0.046     -2.327   0.021     -0.200    -0.016




  Omnibus:        15.860    Durbin-Watson:         2.501


  Prob(Omnibus):   0.000    Jarque-Bera (JB):      7.241


  Skew:            0.252    Prob(JB):             0.0268


  Kurtosis:        2.187    Cond. No.           1.62e+10



In [81]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs5c, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")









    



PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[81]:





<matplotlib.text.Text at 0x1146cbcc0>



In [82]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs5c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[82]:





<matplotlib.text.Text at 0x11490ee48>



In [83]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs5c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[83]:





<matplotlib.text.Text at 0x114c1b4e0>



In [ ]:



In [84]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_A1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6 = pd.merge(out, PDR_differences, how='inner')
print(pairs6.shape)
pairs6 = pairs6.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6.PDR_difference # dependent variable to predict

X = pairs6.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_A1_24', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_A1_24', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[84]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.032


  Model:                    OLS          Adj. R-squared:        0.014


  Method:              Least Squares     F-statistic:           1.811


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):    0.147 


  Time:                  23:50:55        Log-Likelihood:       238.08


  No. Observations:          171         AIC:                  -468.2


  Df Residuals:              167         BIC:                  -455.6


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   -9.5415     16.734     -0.570   0.569    -42.579    23.496


  total_cpg_no_filter   1.698e-08    1.8e-07      0.094   0.925  -3.39e-07  3.73e-07


  bsRate_mean             11.6583     17.437      0.669   0.505    -22.767    46.083


  avgReadCpG_mean         -0.2260      0.104     -2.178   0.031     -0.431    -0.021




  Omnibus:        32.537    Durbin-Watson:         1.923


  Prob(Omnibus):   0.000    Jarque-Bera (JB):      9.102


  Skew:            0.242    Prob(JB):             0.0106


  Kurtosis:        1.979    Cond. No.           9.65e+08



In [85]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24")









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[85]:





<matplotlib.text.Text at 0x114e99eb8>



In [86]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_A1_24")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_A1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[86]:





<matplotlib.text.Text at 0x114980c50>



In [87]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_A1_24")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_A1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[87]:





<matplotlib.text.Text at 0x11526e978>



In [ ]:



In [88]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_B1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6a = pd.merge(out, PDR_differences, how='inner')
print(pairs6a.shape)
pairs6a = pairs6a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6a.PDR_difference # dependent variable to predict

X = pairs6a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_B1_24', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'normal_B_cell_B1_24', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[88]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.002


  Model:                    OLS          Adj. R-squared:       -0.014


  Method:              Least Squares     F-statistic:          0.1175


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):    0.950 


  Time:                  23:51:00        Log-Likelihood:       240.67


  No. Observations:          190         AIC:                  -473.3


  Df Residuals:              186         BIC:                  -460.3


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -18.6419     41.431     -0.450   0.653   -100.378    63.094


  total_cpg_no_filter   -5.84e-09   5.68e-08     -0.103   0.918  -1.18e-07  1.06e-07


  bsRate_mean             19.8881     43.516      0.457   0.648    -65.961   105.737


  avgReadCpG_mean         -0.0539      0.098     -0.551   0.582     -0.247     0.139




  Omnibus:        752.135    Durbin-Watson:         2.033


  Prob(Omnibus):   0.000     Jarque-Bera (JB):     16.936


  Skew:            0.120     Prob(JB):           0.000210


  Kurtosis:        1.557     Cond. No.           4.47e+09



In [89]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24")









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[89]:





<matplotlib.text.Text at 0x1155bfb38>



In [90]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_B1_24")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_B1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[90]:





<matplotlib.text.Text at 0x1158c47f0>



In [91]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_B1_24")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_B1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[91]:





<matplotlib.text.Text at 0x115bbfac8>



In [ ]:



In [92]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_C1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6b = pd.merge(out, PDR_differences, how='inner')
print(pairs6b.shape)
pairs6b = pairs6b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6b.PDR_difference # dependent variable to predict

X = pairs6b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_C1_24', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_C1_24', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[92]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.023


  Model:                    OLS          Adj. R-squared:        0.005


  Method:              Least Squares     F-statistic:           1.284


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):    0.282 


  Time:                  23:51:06        Log-Likelihood:       232.38


  No. Observations:          171         AIC:                  -456.8


  Df Residuals:              167         BIC:                  -444.2


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   69.2598     37.719      1.836   0.068     -5.208   143.728


  total_cpg_no_filter   2.242e-08   4.24e-08      0.529   0.598  -6.13e-08  1.06e-07


  bsRate_mean            -72.8980     39.430     -1.849   0.066   -150.744     4.948


  avgReadCpG_mean          0.1369      0.125      1.099   0.273     -0.109     0.383




  Omnibus:        74.332    Durbin-Watson:         1.019


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     10.315


  Skew:            0.035    Prob(JB):            0.00576


  Kurtosis:        1.799    Cond. No.           7.17e+09



In [93]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24")









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[93]:





<matplotlib.text.Text at 0x115e21710>



In [94]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_C1_24")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_C1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[94]:





<matplotlib.text.Text at 0x116046c50>



In [95]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_C1_24")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_C1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[95]:





<matplotlib.text.Text at 0x1163555f8>



In [ ]:



In [96]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_D1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6c = pd.merge(out, PDR_differences, how='inner')
print(pairs6c.shape)
pairs6c = pairs6c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6c.PDR_difference # dependent variable to predict

X = pairs6c.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_D1_24', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_D1_24', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[96]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.021


  Model:                    OLS          Adj. R-squared:        0.004


  Method:              Least Squares     F-statistic:           1.221


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):    0.304 


  Time:                  23:51:11        Log-Likelihood:       242.78


  No. Observations:          171         AIC:                  -477.6


  Df Residuals:              167         BIC:                  -465.0


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   47.8031     26.749      1.787   0.076     -5.006   100.612


  total_cpg_no_filter   7.699e-09   3.45e-08      0.223   0.824  -6.04e-08  7.58e-08


  bsRate_mean            -49.6685     28.065     -1.770   0.079   -105.076     5.739


  avgReadCpG_mean          0.0045      0.105      0.043   0.966     -0.204     0.213




  Omnibus:        783.935    Durbin-Watson:         2.113


  Prob(Omnibus):   0.000     Jarque-Bera (JB):     16.452


  Skew:            0.203     Prob(JB):           0.000268


  Kurtosis:        1.536     Cond. No.           5.35e+09



In [97]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6c, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24")









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[97]:





<matplotlib.text.Text at 0x1166786d8>



In [98]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_D1_24")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_D1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[98]:





<matplotlib.text.Text at 0x1168a2470>



In [99]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_D1_24")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_D1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[99]:





<matplotlib.text.Text at 0x116bb2e48>



In [ ]:



In [100]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_G1_22']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6d = pd.merge(out, PDR_differences, how='inner')
print(pairs6d.shape)
pairs6d = pairs6d.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6d.PDR_difference # dependent variable to predict

X = pairs6d.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)


print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_G1_22', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_G1_22', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[100]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.074


  Model:                    OLS          Adj. R-squared:        0.057


  Method:              Least Squares     F-statistic:           4.446


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):   0.00493


  Time:                  23:51:17        Log-Likelihood:       251.02


  No. Observations:          171         AIC:                  -494.0


  Df Residuals:              167         BIC:                  -481.5


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -46.3498     24.060     -1.926   0.056    -93.850     1.151


  total_cpg_no_filter  -2.835e-09   4.98e-08     -0.057   0.955  -1.01e-07  9.54e-08


  bsRate_mean             46.5587     25.308      1.840   0.068     -3.406    96.523


  avgReadCpG_mean          0.2439      0.113      2.161   0.032      0.021     0.467




  Omnibus:        48.080    Durbin-Watson:         2.021


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     11.376


  Skew:            0.308    Prob(JB):            0.00339


  Kurtosis:        1.896    Cond. No.           4.19e+09



In [101]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6d, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22")









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[101]:





<matplotlib.text.Text at 0x116ec7748>



In [102]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6d, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_G1_22")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_G1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[102]:





<matplotlib.text.Text at 0x117105d30>



In [103]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6d, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_G1_22")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_G1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[103]:





<matplotlib.text.Text at 0x116b98a90>



In [ ]:



In [104]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_H1_22']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6e = pd.merge(out, PDR_differences, how='inner')
print(pairs6e.shape)
pairs6e = pairs6e.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6e.PDR_difference # dependent variable to predict

X = pairs6e.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)



print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_H1_22', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



15
(105, 14)
(105,)
(105, 3)
Regression results for Normal 'normal_B_cell_H1_22', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[104]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.186


  Model:                    OLS          Adj. R-squared:        0.161


  Method:              Least Squares     F-statistic:           7.676


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  0.000113


  Time:                  23:51:23        Log-Likelihood:       161.82


  No. Observations:          105         AIC:                  -315.6


  Df Residuals:              101         BIC:                  -305.0


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  104.7329     46.883      2.234   0.028     11.729   197.737


  total_cpg_no_filter   1.588e-07   6.84e-08      2.323   0.022   2.32e-08  2.94e-07


  bsRate_mean           -103.9700     48.335     -2.151   0.034   -199.853    -8.087


  avgReadCpG_mean         -0.6915      0.193     -3.587   0.001     -1.074    -0.309




  Omnibus:        12.492    Durbin-Watson:         1.433


  Prob(Omnibus):   0.002    Jarque-Bera (JB):      5.947


  Skew:            0.366    Prob(JB):             0.0511


  Kurtosis:        2.093    Cond. No.           6.03e+09



In [105]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6e, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22")









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[105]:





<matplotlib.text.Text at 0x1175c42e8>



In [106]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6e, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_H1_22")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_H1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[106]:





<matplotlib.text.Text at 0x11781ac88>



In [107]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6e, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_H1_22")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_H1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[107]:





<matplotlib.text.Text at 0x117a43668>



In [108]:

    
print(pairs1.shape)
print(pairs1a.shape)
print(pairs2.shape)
print(pairs2a.shape)
print(pairs2b.shape)
print(pairs3.shape)
print(pairs3a.shape)
print(pairs3b.shape)
print(pairs3c.shape)
print(pairs4.shape)
print(pairs4a.shape)
print(pairs4b.shape)
print(pairs4c.shape)
print(pairs5.shape)
print(pairs5a.shape)
print(pairs5b.shape)
print(pairs5c.shape)
print(pairs6.shape)
print(pairs6a.shape)
print(pairs6b.shape)
print(pairs6c.shape)
print(pairs6d.shape)
print(pairs6e.shape)









    



(210, 14)
(210, 14)
(190, 14)
(190, 14)
(190, 14)
(136, 14)
(231, 14)
(36, 14)
(190, 14)
(153, 14)
(153, 14)
(136, 14)
(210, 14)
(210, 14)
(190, 14)
(231, 14)
(190, 14)
(171, 14)
(190, 14)
(171, 14)
(171, 14)
(171, 14)
(105, 14)



In [109]:

    
pairs1['type'] = str('CLL')
pairs1a['type'] = str('CLL')
pairs2['type'] = str('CLL')
pairs2a['type'] = str('CLL')
pairs2b['type'] = str('CLL')
pairs3['type'] = str('normal')
pairs3a['type'] = str('normal')
pairs3b['type'] = str('normal')
pairs3c['type'] = str('normal')
pairs4['type'] = str('normal')
pairs4a['type'] = str('normal')
pairs4b['type'] = str('normal')
pairs4c['type'] = str('normal')
pairs5['type'] = str('normal')
pairs5a['type'] = str('normal')
pairs5b['type'] = str('normal')
pairs5c['type'] = str('normal')
pairs6['type'] = str('normal')
pairs6a['type'] = str('normal')
pairs6b['type'] = str('normal')
pairs6c['type'] = str('normal')
pairs6d['type'] = str('normal')
pairs6e['type'] = str('normal')


frames = [pairs1, pairs1a, pairs2, pairs2a, pairs2b, pairs3, pairs3a, pairs3b, pairs3c, 
          pairs4, pairs4a, pairs4b, pairs4c, pairs5, pairs5a, pairs5b, pairs5c, pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e]



In [110]:

    
total_pairs = pd.concat(frames)



In [111]:

    
total_pairs.shape









    Out[111]:





(4035, 15)



In [112]:

    
y = total_pairs.PDR_difference # dependent variable
X = total_pairs.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)



In [113]:

    
categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)
    
X = X.drop(['type_normal'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR_unweighted")
est.summary()









    



Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR_unweighted






    Out[113]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:              0.321 


  Model:                    OLS          Adj. R-squared:         0.321 


  Method:              Least Squares     F-statistic:            477.0 


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):      0.00  


  Time:                  23:51:27        Log-Likelihood:        7074.5 


  No. Observations:         4035         AIC:                -1.414e+04


  Df Residuals:             4030         BIC:                -1.411e+04


  Df Model:                    4                                       


  Covariance Type:       nonrobust                                     




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    1.6460      0.051     32.445   0.000      1.547     1.745


  total_cpg_no_filter   4.743e-09   3.48e-09      1.365   0.172  -2.07e-09  1.16e-08


  bsRate_mean             -1.5079      0.047    -31.772   0.000     -1.601    -1.415


  avgReadCpG_mean         -0.0167      0.001    -14.471   0.000     -0.019    -0.014


  type_CLL                -0.0630      0.002    -34.649   0.000     -0.067    -0.059




  Omnibus:        345.980    Durbin-Watson:         1.627


  Prob(Omnibus):   0.000     Jarque-Bera (JB):    440.460


  Skew:            0.768     Prob(JB):           2.27e-96


  Kurtosis:        3.508     Cond. No.           5.10e+07



In [114]:

    
X.head()









    Out[114]:






  
    
      
      const
      total_cpg_no_filter
      bsRate_mean
      avgReadCpG_mean
      type_CLL
    
  
  
    
      0
      1
      709541.5
      0.980181
      7.029760
      1.0
    
    
      1
      1
      786284.0
      0.980293
      7.017405
      1.0
    
    
      2
      1
      683623.5
      0.980268
      7.039100
      1.0
    
    
      3
      1
      853200.5
      0.980262
      7.028117
      1.0
    
    
      4
      1
      842686.5
      0.980228
      7.025451
      1.0



In [115]:

    
print("PDR_difference vs. bsRate, jointplot, both CLL and normal ")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, both CLL and normal ")









    



PDR_difference vs. bsRate, jointplot, both CLL and normal 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[115]:





<matplotlib.text.Text at 0x117cb1390>



In [116]:

    
print("PDR_difference vs. mean avgReadCpG per cell, jointplot, both CLL and normal ")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpG_mean, jointplot, both CLL and normal ")









    



PDR_difference vs. mean avgReadCpG per cell, jointplot, both CLL and normal 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[116]:





<matplotlib.text.Text at 0x117f93780>



In [117]:

    
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. total # Unique_CpGs per cell, jointplot, both CLL and normal ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[117]:





<matplotlib.text.Text at 0x1182a2eb8>



In [118]:

    
print("PDR_difference vs. bsRate, by type! ")
sns.lmplot(x="bsRate_mean", y="PDR_difference",  data=total_pairs, hue='type')
plt.title("PDR_difference vs. bsRate, by type, CLL vs normal")









    



PDR_difference vs. bsRate, by type! 






    Out[118]:





<matplotlib.text.Text at 0x118598240>



In [119]:

    
print("PDR_difference vs. avgReadCpG per cell, by type! ")
sns.lmplot(x="avgReadCpG_mean", y="PDR_difference",  data=total_pairs, hue='type')
plt.title("PDR_difference vs. avgReadCpG per cell, by type, CLL vs Normal ")









    



PDR_difference vs. avgReadCpG per cell, by type! 






    Out[119]:





<matplotlib.text.Text at 0x11860fa90>



In [120]:

    
print("PDR_difference vs. total unique CpG per cel, by type!")
sns.lmplot(x="total_cpg_no_filter", y="PDR_difference",  data=total_pairs, hue='type')
plt.title("PDR_difference vs. total unique CpG per cell, by type, CLL vs Normal ")









    



PDR_difference vs. total unique CpG per cel, by type!






    Out[120]:





<matplotlib.text.Text at 0x1187c7e80>



In [121]:

    
#
# Let's see feature ranking
#
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: cell type, avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:11: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting






    



Random Forest model score is 0.900956617468



In [122]:

    
#
# Now do analysis by cell type, CLL vs normal, entire
#
cll_frames = [pairs1, pairs1a, pairs2, pairs2a, pairs2b]

normal_frames = [pairs3, pairs3a, pairs3b, pairs3c, pairs4, pairs4a, pairs4b, pairs4c, 
                 pairs5, pairs5a, pairs5b, pairs5c, pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e]

cll_pairs = pd.concat(cll_frames)
print(cll_pairs.shape)

normal_pairs = pd.concat(normal_frames)
print(normal_pairs.shape)









    



(990, 15)
(3045, 15)



In [123]:

    
#
# CLL first
#
y = cll_pairs.PDR_difference # dependent variable
X = cll_pairs.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG", "type"], axis=1)



X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR")
print(X.shape)
est.summary()









    



Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR
(990, 4)






    Out[123]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.013


  Model:                    OLS          Adj. R-squared:        0.010


  Method:              Least Squares     F-statistic:           4.396


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):   0.00441


  Time:                  23:54:12        Log-Likelihood:       3925.1


  No. Observations:          990         AIC:                  -7842.


  Df Residuals:              986         BIC:                  -7823.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    0.0600      0.031      1.944   0.052     -0.001     0.121


  total_cpg_no_filter   3.246e-10   1.24e-09      0.262   0.794  -2.11e-09  2.76e-09


  bsRate_mean             -0.0685      0.028     -2.439   0.015     -0.124    -0.013


  avgReadCpG_mean          0.0018      0.002      0.886   0.376     -0.002     0.006




  Omnibus:        127.782    Durbin-Watson:         2.004


  Prob(Omnibus):   0.000     Jarque-Bera (JB):    181.632


  Skew:            0.942     Prob(JB):           3.62e-40


  Kurtosis:        3.924     Cond. No.           1.66e+08



In [124]:

    
print("PDR_difference vs. bsRate, jointplot, CLL pairs ")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  CLL pairs ")









    



PDR_difference vs. bsRate, jointplot, CLL pairs 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[124]:





<matplotlib.text.Text at 0x13b124a58>



In [125]:

    
print("PDR_difference vs. mean avgReadCpG per cell, jointplot,  CLL pairs ")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpG_mean, jointplot, CLL pairs ")









    



PDR_difference vs. mean avgReadCpG per cell, jointplot,  CLL pairs 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[125]:





<matplotlib.text.Text at 0x22ac0bd30>



In [126]:

    
print("PDR difference vs. total # unique CpGs per cell, CLL pairs")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. total # Unique_CpGs per cell,  CLL pairs")









    



PDR difference vs. total # unique CpGs per cell, CLL pairs






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[126]:





<matplotlib.text.Text at 0x22b0ef438>



In [127]:

    
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting






    



Random Forest model score is 0.876379343843



In [ ]:



In [128]:

    
#
# normal
#
y = normal_pairs.PDR_difference # dependent variable
X = normal_pairs.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG", "type"], axis=1)



X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR")
print(X.shape)
est.summary()









    



Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR
(3045, 4)






    Out[128]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.225 


  Model:                    OLS          Adj. R-squared:        0.224 


  Method:              Least Squares     F-statistic:           294.2 


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  1.12e-167


  Time:                  23:55:04        Log-Likelihood:       4949.8 


  No. Observations:         3045         AIC:                  -9892. 


  Df Residuals:             3041         BIC:                  -9868. 


  Df Model:                    3                                      


  Covariance Type:       nonrobust                                    




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    1.8487      0.063     29.522   0.000      1.726     1.971


  total_cpg_no_filter  -1.205e-08   4.63e-09     -2.604   0.009  -2.11e-08 -2.98e-09


  bsRate_mean             -1.6890      0.058    -29.047   0.000     -1.803    -1.575


  avgReadCpG_mean         -0.0194      0.001    -14.308   0.000     -0.022    -0.017




  Omnibus:        171.194    Durbin-Watson:         1.660


  Prob(Omnibus):   0.000     Jarque-Bera (JB):    194.262


  Skew:            0.605     Prob(JB):           6.56e-43


  Kurtosis:        2.738     Cond. No.           4.39e+07



In [129]:

    
print("PDR_difference vs. bsRate, jointplot, Normal pairs ")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal pairs ")









    



PDR_difference vs. bsRate, jointplot, Normal pairs 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[129]:





<matplotlib.text.Text at 0x118f53dd8>



In [130]:

    
print("PDR_difference vs. mean avgReadCpG per cell, jointplot, Normal pairs ")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpG_mean, jointplot, Normal pairs ")









    



PDR_difference vs. mean avgReadCpG per cell, jointplot, Normal pairs 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[130]:





<matplotlib.text.Text at 0x1193eada0>



In [131]:

    
print("PDR diference vs. total # unique CpGs per cell, Normal pairs")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. total # Unique_CpGs per cell, Normal pairs")









    



PDR diference vs. total # unique CpGs per cell, Normal pairs






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[131]:





<matplotlib.text.Text at 0x11981d3c8>



In [132]:

    
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting






    



Random Forest model score is 0.884245143819



In [ ]:



In [ ]:



In [ ]:



In [ ]:

	filename	methylation	PDR_total	methylation_unweighted	PDR_unweighted	thisMeth	mixedReadCount	total_reads	type	bio	protocol	total_cpg_no_filter	total_cpg_gtrthan1	total_cpg_gtrthan38	bsRate	avgReadCpgs_nofilter	avgReadCpgs_lessthan1CpG	avgReadCpgs_gtreql3.8CpG
0	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CATGAC	0.529505	0.235795	0.632802	0.231878	2208325.0	983394.0	4170549.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	525282.0	525251.0	435636.0	0.9975	5.354284	5.355660	7.019255
1	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CCTTCG	0.455550	0.177631	0.583859	0.175371	733064.0	285841.0	1609185.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	221972.0	221962.0	186757.0	0.9975	5.587294	5.588449	7.302612
2	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CGGTAG	0.515269	0.177645	0.618578	0.174221	1452802.0	500870.0	2819500.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	355730.0	355713.0	295624.0	0.9975	5.393199	5.394331	7.079288
3	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTATTG	0.556175	0.176367	0.652727	0.172273	2279354.0	722800.0	4098270.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	483179.0	483150.0	397812.0	0.9975	5.287116	5.288477	6.979525
4	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTCAGC	0.528642	0.181331	0.640401	0.172287	1394208.0	478231.0	2637340.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	356122.0	356100.0	294065.0	0.9975	5.314302	5.315719	6.995052

Dep. Variable:	PDR_difference	R-squared:	0.047
Model:	OLS	Adj. R-squared:	0.033
Method:	Least Squares	F-statistic:	3.374
Date:	Tue, 09 Aug 2016	Prob (F-statistic):	0.0194
Time:	23:49:50	Log-Likelihood:	884.39
No. Observations:	210	AIC:	-1761.
Df Residuals:	206	BIC:	-1747.
Df Model:	3
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
const	8.7620	3.468	2.526	0.012	1.925 15.599
total_cpg_no_filter	6.31e-09	3.43e-09	1.842	0.067	-4.45e-10 1.31e-08
bsRate_mean	-9.0390	3.536	-2.556	0.011	-16.011 -2.067
avgReadCpG_mean	0.0140	0.009	1.600	0.111	-0.003 0.031

Omnibus:	11.066	Durbin-Watson:	2.056
Prob(Omnibus):	0.004	Jarque-Bera (JB):	11.795
Skew:	0.562	Prob(JB):	0.00275
Kurtosis:	2.707	Cond. No.	1.46e+10

Omnibus:	13.995	Durbin-Watson:	2.461
Prob(Omnibus):	0.001	Jarque-Bera (JB):	9.802
Skew:	0.406	Prob(JB):	0.00744
Kurtosis:	2.322	Cond. No.	1.68e+10

Omnibus:	13.706	Durbin-Watson:	2.228
Prob(Omnibus):	0.001	Jarque-Bera (JB):	10.458
Skew:	0.469	Prob(JB):	0.00536
Kurtosis:	2.335	Cond. No.	3.27e+09

Omnibus:	16.408	Durbin-Watson:	1.723
Prob(Omnibus):	0.000	Jarque-Bera (JB):	18.113
Skew:	0.740	Prob(JB):	0.000117
Kurtosis:	3.314	Cond. No.	4.47e+09

Omnibus:	11.003	Durbin-Watson:	1.935
Prob(Omnibus):	0.004	Jarque-Bera (JB):	11.304
Skew:	0.584	Prob(JB):	0.00351
Kurtosis:	3.252	Cond. No.	6.96e+09

Omnibus:	10.120	Durbin-Watson:	1.840
Prob(Omnibus):	0.006	Jarque-Bera (JB):	11.038
Skew:	0.688	Prob(JB):	0.00401
Kurtosis:	2.769	Cond. No.	7.56e+09

Omnibus:	41.518	Durbin-Watson:	1.321
Prob(Omnibus):	0.000	Jarque-Bera (JB):	58.549
Skew:	1.115	Prob(JB):	1.93e-13
Kurtosis:	4.054	Cond. No.	1.03e+10

Omnibus:	1.832	Durbin-Watson:	1.728
Prob(Omnibus):	0.400	Jarque-Bera (JB):	1.706
Skew:	0.450	Prob(JB):	0.426
Kurtosis:	2.428	Cond. No.	3.29e+10

Omnibus:	11.949	Durbin-Watson:	1.590
Prob(Omnibus):	0.003	Jarque-Bera (JB):	13.024
Skew:	0.627	Prob(JB):	0.00149
Kurtosis:	2.733	Cond. No.	8.48e+09

Omnibus:	50.644	Durbin-Watson:	1.624
Prob(Omnibus):	0.000	Jarque-Bera (JB):	92.356
Skew:	1.609	Prob(JB):	8.81e-21
Kurtosis:	5.034	Cond. No.	1.52e+10

Omnibus:	71.909	Durbin-Watson:	0.216
Prob(Omnibus):	0.000	Jarque-Bera (JB):	179.188
Skew:	2.083	Prob(JB):	1.23e-39
Kurtosis:	6.279	Cond. No.	1.55e+10

Omnibus:	22.529	Durbin-Watson:	1.289
Prob(Omnibus):	0.000	Jarque-Bera (JB):	28.019
Skew:	0.995	Prob(JB):	8.24e-07
Kurtosis:	3.991	Cond. No.	1.28e+10

Omnibus:	65.926	Durbin-Watson:	0.267
Prob(Omnibus):	0.000	Jarque-Bera (JB):	130.262
Skew:	1.558	Prob(JB):	5.17e-29
Kurtosis:	5.275	Cond. No.	3.13e+10

Omnibus:	87.321	Durbin-Watson:	1.967
Prob(Omnibus):	0.000	Jarque-Bera (JB):	15.226
Skew:	0.304	Prob(JB):	0.000494
Kurtosis:	1.829	Cond. No.	1.48e+10

Omnibus:	13.462	Durbin-Watson:	1.115
Prob(Omnibus):	0.001	Jarque-Bera (JB):	14.990
Skew:	0.686	Prob(JB):	0.000556
Kurtosis:	2.900	Cond. No.	1.63e+10

Omnibus:	12.498	Durbin-Watson:	1.764
Prob(Omnibus):	0.002	Jarque-Bera (JB):	11.666
Skew:	0.490	Prob(JB):	0.00293
Kurtosis:	2.499	Cond. No.	2.98e+10

Omnibus:	15.860	Durbin-Watson:	2.501
Prob(Omnibus):	0.000	Jarque-Bera (JB):	7.241
Skew:	0.252	Prob(JB):	0.0268
Kurtosis:	2.187	Cond. No.	1.62e+10

Omnibus:	32.537	Durbin-Watson:	1.923
Prob(Omnibus):	0.000	Jarque-Bera (JB):	9.102
Skew:	0.242	Prob(JB):	0.0106
Kurtosis:	1.979	Cond. No.	9.65e+08

Omnibus:	752.135	Durbin-Watson:	2.033
Prob(Omnibus):	0.000	Jarque-Bera (JB):	16.936
Skew:	0.120	Prob(JB):	0.000210
Kurtosis:	1.557	Cond. No.	4.47e+09

Omnibus:	74.332	Durbin-Watson:	1.019
Prob(Omnibus):	0.000	Jarque-Bera (JB):	10.315
Skew:	0.035	Prob(JB):	0.00576
Kurtosis:	1.799	Cond. No.	7.17e+09

Omnibus:	783.935	Durbin-Watson:	2.113
Prob(Omnibus):	0.000	Jarque-Bera (JB):	16.452
Skew:	0.203	Prob(JB):	0.000268
Kurtosis:	1.536	Cond. No.	5.35e+09

Omnibus:	48.080	Durbin-Watson:	2.021
Prob(Omnibus):	0.000	Jarque-Bera (JB):	11.376
Skew:	0.308	Prob(JB):	0.00339
Kurtosis:	1.896	Cond. No.	4.19e+09

Omnibus:	12.492	Durbin-Watson:	1.433
Prob(Omnibus):	0.002	Jarque-Bera (JB):	5.947
Skew:	0.366	Prob(JB):	0.0511
Kurtosis:	2.093	Cond. No.	6.03e+09

Omnibus:	345.980	Durbin-Watson:	1.627
Prob(Omnibus):	0.000	Jarque-Bera (JB):	440.460
Skew:	0.768	Prob(JB):	2.27e-96
Kurtosis:	3.508	Cond. No.	5.10e+07

	const	total_cpg_no_filter	bsRate_mean	avgReadCpG_mean	type_CLL
0	1	709541.5	0.980181	7.029760	1.0
1	1	786284.0	0.980293	7.017405	1.0
2	1	683623.5	0.980268	7.039100	1.0
3	1	853200.5	0.980262	7.028117	1.0
4	1	842686.5	0.980228	7.025451	1.0

Omnibus:	127.782	Durbin-Watson:	2.004
Prob(Omnibus):	0.000	Jarque-Bera (JB):	181.632
Skew:	0.942	Prob(JB):	3.62e-40
Kurtosis:	3.924	Cond. No.	1.66e+08

Omnibus:	171.194	Durbin-Watson:	1.660
Prob(Omnibus):	0.000	Jarque-Bera (JB):	194.262
Skew:	0.605	Prob(JB):	6.56e-43
Kurtosis:	2.738	Cond. No.	4.39e+07