notebook.community

Edit and run



In [1]:

    
%matplotlib inline



In [2]:

    
#
# Covariates are: 
#   - Number of unique CpGs per cell
#   - Median Average Read CpG per cell (or mean if normally distributed)
#   - BS rate per cell
#   - CLL or Normal status per cell
#
# For distances (i.e. PDR difference between pairs, methylation difference between pairs), 
# the covariates are the mean between the two pairs. 
#



In [3]:

    
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # print all rows


import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')

import statsmodels.api as sm



In [4]:

    
stats = pd.read_csv("all_RRBS_statistics_final.csv")



In [5]:

    
stats.shape









    Out[5]:





(438, 15)



In [6]:

    
normal = stats[stats["type"]=="normal"]
CLL = stats[stats["type"]=="CLL"]



In [7]:

    
len(normal)









    Out[7]:





336



In [8]:

    
len(CLL)









    Out[8]:





102



In [9]:

    
mcell_cpg = pd.read_csv("mcell_avgCpGs.csv")
pcell_cpg = pd.read_csv("pcell_avgCpGs.csv")
CD19cell_cpg = pd.read_csv("CD19_avgCpGs.csv")
normalB_cell_cpg = pd.read_csv("normalB_avgCpGs.csv")
trito_cell_cpg = pd.read_csv("trito_avgCpGs.csv")
cw154_cell_cpg = pd.read_csv("cw154_cpgs.csv")



In [10]:

    
mcell_cpg = mcell_cpg.drop(["Unnamed: 0"], axis=1)
pcell_cpg = pcell_cpg.drop(["Unnamed: 0"], axis=1)
CD19cell_cpg = CD19cell_cpg.drop(["Unnamed: 0"], axis=1)
normalB_cell_cpg = normalB_cell_cpg.drop(["Unnamed: 0"], axis=1)
trito_cell_cpg = trito_cell_cpg.drop(["Unnamed: 0"], axis=1)
cw154_cell_cpg = cw154_cell_cpg.drop(["Unnamed: 0"], axis=1)



In [11]:

    
cpg_total = pd.concat([mcell_cpg, pcell_cpg, CD19cell_cpg, normalB_cell_cpg, trito_cell_cpg, cw154_cell_cpg])



In [12]:

    
cpg_total.shape









    Out[12]:





(513, 4)



In [13]:

    
merged = stats.merge(cpg_total, on="filename")



In [14]:

    
merged.shape









    Out[14]:





(438, 18)



In [15]:

    
merged.head()









    Out[15]:






  
    
      
      filename
      methylation
      PDR_total
      methylation_unweighted
      PDR_unweighted
      thisMeth
      mixedReadCount
      total_reads
      type
      bio
      protocol
      total_cpg_no_filter
      total_cpg_gtrthan1
      total_cpg_gtrthan38
      bsRate
      avgReadCpgs_nofilter
      avgReadCpgs_lessthan1CpG
      avgReadCpgs_gtreql3.8CpG
    
  
  
    
      0
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CATGAC
      0.529505
      0.235795
      0.632802
      0.231878
      2208325.0
      983394.0
      4170549.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      525282.0
      525251.0
      435636.0
      0.9975
      5.354284
      5.355660
      7.019255
    
    
      1
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CCTTCG
      0.455550
      0.177631
      0.583859
      0.175371
      733064.0
      285841.0
      1609185.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      221972.0
      221962.0
      186757.0
      0.9975
      5.587294
      5.588449
      7.302612
    
    
      2
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CGGTAG
      0.515269
      0.177645
      0.618578
      0.174221
      1452802.0
      500870.0
      2819500.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      355730.0
      355713.0
      295624.0
      0.9975
      5.393199
      5.394331
      7.079288
    
    
      3
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTATTG
      0.556175
      0.176367
      0.652727
      0.172273
      2279354.0
      722800.0
      4098270.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      483179.0
      483150.0
      397812.0
      0.9975
      5.287116
      5.288477
      6.979525
    
    
      4
      RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTCAGC
      0.528642
      0.181331
      0.640401
      0.172287
      1394208.0
      478231.0
      2637340.0
      normal
      CD19CD27m
      NormalBCD19pCD27mcell67_88
      356122.0
      356100.0
      294065.0
      0.9975
      5.314302
      5.315719
      6.995052



In [ ]:



In [ ]:



In [16]:

    
tritopool = merged[merged["protocol"] == 'trito_pool_1']       # select only "trito_pool_1" files
print(len(tritopool))
tritopool = tritopool.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
tritopoolA = tritopool.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool.filename,2)) # combines into all pairs
out = pd.DataFrame([tritopoolA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.PDR_unweighted, tritopool.PDR_unweighted)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1 = pd.merge(out, PDR_differences, how='inner')
print(pairs1.shape)

pairs1 = pairs1.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs1.PDR_difference # dependent variable to predict

X = pairs1.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



21
(210, 14)
(210,)
(210, 3)
Regression results for CLL 'RRBS_trito_pool1', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[16]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.046


  Model:                    OLS          Adj. R-squared:        0.033


  Method:              Least Squares     F-statistic:           3.346


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):   0.0201 


  Time:                  21:41:58        Log-Likelihood:       884.34


  No. Observations:          210         AIC:                  -1761.


  Df Residuals:              206         BIC:                  -1747.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    8.9491      3.463      2.584   0.010      2.121    15.777


  total_cpg_no_filter   4.547e-09   3.87e-09      1.175   0.241  -3.08e-09  1.22e-08


  bsRate_mean             -9.2084      3.533     -2.606   0.010    -16.175    -2.242


  avgReadCpG_mean          0.0147      0.009      1.574   0.117     -0.004     0.033




  Omnibus:        11.112    Durbin-Watson:         2.070


  Prob(Omnibus):   0.004    Jarque-Bera (JB):     11.804


  Skew:            0.560    Prob(JB):            0.00273


  Kurtosis:        2.692    Cond. No.           1.46e+10



In [17]:

    
print("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs1, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")









    



PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[17]:





<matplotlib.text.Text at 0x10b8a0eb8>



In [18]:

    
print("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs1, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")









    



PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[18]:





<matplotlib.text.Text at 0x10bc472b0>



In [19]:

    
print("PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs1, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")









    



PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_1






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[19]:





<matplotlib.text.Text at 0x10bdcab70>



In [20]:

    
tritopool = merged[merged["protocol"] == 'trito_pool_2']       # select only "trito_pool_1" files
print(len(tritopool))
tritopool = tritopool.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
tritopoolA = tritopool.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool.filename,2)) # combines into all pairs
out = pd.DataFrame([tritopoolA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.PDR_unweighted, tritopool.PDR_unweighted)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1a = pd.merge(out, PDR_differences, how='inner')
print(pairs1a.shape)
pairs1a = pairs1a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs1a.PDR_difference # dependent variable to predict

X = pairs1a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



21
(210, 14)
(210,)
(210, 3)
Regression results for CLL 'RRBS_trito_pool1', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[20]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.045


  Model:                    OLS          Adj. R-squared:        0.031


  Method:              Least Squares     F-statistic:           3.218


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):   0.0238 


  Time:                  21:42:01        Log-Likelihood:       861.73


  No. Observations:          210         AIC:                  -1715.


  Df Residuals:              206         BIC:                  -1702.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    7.7698      5.477      1.419   0.158     -3.028    18.568


  total_cpg_no_filter   1.016e-08   3.65e-09      2.786   0.006   2.97e-09  1.73e-08


  bsRate_mean             -7.9514      5.670     -1.402   0.162    -19.129     3.226


  avgReadCpG_mean         -0.0023      0.013     -0.180   0.858     -0.027     0.023




  Omnibus:        14.102    Durbin-Watson:         2.436


  Prob(Omnibus):   0.001    Jarque-Bera (JB):     10.064


  Skew:            0.417    Prob(JB):            0.00653


  Kurtosis:        2.327    Cond. No.           2.05e+10



In [21]:

    
print("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs1a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")









    



PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[21]:





<matplotlib.text.Text at 0x10ca304a8>



In [22]:

    
print("PDR_difference vs. mean avgReadCpGs, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs1a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")









    



PDR_difference vs. mean avgReadCpGs, jointplot, RRBS_trito_pool CLL trito_2






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[22]:





<matplotlib.text.Text at 0x10cc46128>



In [23]:

    
print("PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs1a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")









    



PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_2






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[23]:





<matplotlib.text.Text at 0x10cf51e80>



In [ ]:



In [24]:

    
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.PDR_unweighted, cw154.PDR_unweighted)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2 = pd.merge(out, PDR_differences, how='inner')
print(pairs2.shape)
pairs2 = pairs2.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs2.PDR_difference # dependent variable to predict

X = pairs2.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_Tris_protease', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_Tris_protease', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[24]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.014


  Model:                    OLS          Adj. R-squared:       -0.002


  Method:              Least Squares     F-statistic:          0.8881


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):    0.448 


  Time:                  21:42:05        Log-Likelihood:       780.58


  No. Observations:          190         AIC:                  -1553.


  Df Residuals:              186         BIC:                  -1540.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   -2.8995      1.809     -1.603   0.111     -6.468     0.669


  total_cpg_no_filter  -1.407e-09   2.99e-09     -0.470   0.639  -7.31e-09   4.5e-09


  bsRate_mean              3.0408      1.891      1.608   0.110     -0.691     6.772


  avgReadCpG_mean         -0.0030      0.005     -0.643   0.521     -0.012     0.006




  Omnibus:        13.402    Durbin-Watson:         2.209


  Prob(Omnibus):   0.001    Jarque-Bera (JB):     11.455


  Skew:            0.518    Prob(JB):            0.00326


  Kurtosis:        2.390    Cond. No.           3.35e+09



In [25]:

    
print("PDR_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs2, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, cw154  CLL cw154_Tris_protease")









    



PDR_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[25]:





<matplotlib.text.Text at 0x10ca56e80>



In [26]:

    
print("PDR_difference vs. avgReadCpGs, jointplot, CLL cw154_Tris_protease")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs2, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_Tris_protease")









    



PDR_difference vs. avgReadCpGs, jointplot, CLL cw154_Tris_protease






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[26]:





<matplotlib.text.Text at 0x10d468f28>



In [27]:

    
print("PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs2, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_Tris_protease")









    



PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[27]:





<matplotlib.text.Text at 0x10d69e9e8>



In [28]:

    
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease_GR']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.PDR_unweighted, cw154.PDR_unweighted)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2a = pd.merge(out, PDR_differences, how='inner')
print(pairs2a.shape)
pairs2a = pairs2a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs2a.PDR_difference # dependent variable to predict

X = pairs2a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_Tris_protease_GR', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_Tris_protease_GR', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[28]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.134


  Model:                    OLS          Adj. R-squared:        0.120


  Method:              Least Squares     F-statistic:           9.604


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  6.30e-06


  Time:                  21:42:08        Log-Likelihood:       754.84


  No. Observations:          190         AIC:                  -1502.


  Df Residuals:              186         BIC:                  -1489.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   -5.1998      2.359     -2.204   0.029     -9.853    -0.546


  total_cpg_no_filter  -6.633e-09   2.94e-09     -2.254   0.025  -1.24e-08 -8.27e-10


  bsRate_mean              5.3602      2.470      2.170   0.031      0.487    10.233


  avgReadCpG_mean          0.0108      0.004      2.451   0.015      0.002     0.019




  Omnibus:        16.926    Durbin-Watson:         1.740


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     18.834


  Skew:            0.756    Prob(JB):           8.13e-05


  Kurtosis:        3.304    Cond. No.           4.71e+09



In [29]:

    
print("PDR_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease_GR")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs2a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, cw154  CLL cw154_Tris_protease_GR")









    



PDR_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease_GR






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[29]:





<matplotlib.text.Text at 0x10d91db00>



In [30]:

    
print("PDR_difference vs. avgReadCpgs, jointplot, CLL cw154_Tris_protease_GR")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs2a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_Tris_protease_GR")









    



PDR_difference vs. avgReadCpgs, jointplot, CLL cw154_Tris_protease_GR






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[30]:





<matplotlib.text.Text at 0x10db24860>



In [31]:

    
print("PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease_GR")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs2a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_Tris_protease_GR")









    



PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease_GR






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[31]:





<matplotlib.text.Text at 0x10de36a90>



In [ ]:



In [ ]:



In [32]:

    
cw154 = merged[merged["protocol"] == 'cw154_CutSmart_proteinase_K']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.PDR_unweighted, cw154.PDR_unweighted)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2b = pd.merge(out, PDR_differences, how='inner')
print(pairs2b.shape)
pairs2b = pairs2b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs2b.PDR_difference # dependent variable to predict

X = pairs2b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[32]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.131


  Model:                    OLS          Adj. R-squared:        0.117


  Method:              Least Squares     F-statistic:           9.334


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  8.86e-06


  Time:                  21:42:12        Log-Likelihood:       721.12


  No. Observations:          190         AIC:                  -1434.


  Df Residuals:              186         BIC:                  -1421.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   -9.8580      3.375     -2.921   0.004    -16.516    -3.200


  total_cpg_no_filter  -2.227e-08   4.24e-09     -5.251   0.000  -3.06e-08 -1.39e-08


  bsRate_mean             10.1443      3.506      2.894   0.004      3.229    17.060


  avgReadCpG_mean          0.0183      0.011      1.598   0.112     -0.004     0.041




  Omnibus:        13.999    Durbin-Watson:         1.928


  Prob(Omnibus):   0.001    Jarque-Bera (JB):     14.870


  Skew:            0.656    Prob(JB):           0.000590


  Kurtosis:        3.398    Cond. No.           6.97e+09



In [33]:

    
print("PDR_difference vs. bsRate, jointplot,  CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs2b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")









    



PDR_difference vs. bsRate, jointplot,  CLL cw154_CutSmart_proteinase_K






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[33]:





<matplotlib.text.Text at 0x10e0cd5c0>



In [34]:

    
print("PDR_difference vs. avgReadCpG_mean, jointplot, CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs2b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_CutSmart_proteinase_K")









    



PDR_difference vs. avgReadCpG_mean, jointplot, CLL cw154_CutSmart_proteinase_K






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[34]:





<matplotlib.text.Text at 0x10e2e5518>



In [35]:

    
print("PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs2b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")









    



PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_CutSmart_proteinase_K






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[35]:





<matplotlib.text.Text at 0x10e519198>



In [ ]:



In [36]:

    
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell1_22_']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR_unweighted, pcell.PDR_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3 = pd.merge(out, PDR_differences, how='inner')
print(pairs3.shape)
pairs3 = pairs3.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs3.PDR_difference # dependent variable to predict

X = pairs3.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



17
(136, 14)
(136,)
(136, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[36]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.019


  Model:                    OLS          Adj. R-squared:       -0.004


  Method:              Least Squares     F-statistic:          0.8348


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):    0.477 


  Time:                  21:42:15        Log-Likelihood:       240.63


  No. Observations:          136         AIC:                  -473.3


  Df Residuals:              132         BIC:                  -461.6


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    9.8844     62.393      0.158   0.874   -113.535   133.304


  total_cpg_no_filter  -9.493e-08   7.87e-08     -1.207   0.230  -2.51e-07  6.07e-08


  bsRate_mean             -9.0554     62.449     -0.145   0.885   -132.585   114.475


  avgReadCpG_mean         -0.1446      0.093     -1.550   0.123     -0.329     0.040




  Omnibus:         9.622    Durbin-Watson:         1.838


  Prob(Omnibus):   0.008    Jarque-Bera (JB):     10.406


  Skew:            0.665    Prob(JB):            0.00550


  Kurtosis:        2.740    Cond. No.           7.73e+09



In [37]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs3, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[37]:





<matplotlib.text.Text at 0x10e767898>



In [38]:

    
print("PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs3, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell1_22_")









    



PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[38]:





<matplotlib.text.Text at 0x10e9d69b0>



In [39]:

    
print("PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs3, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")









    



PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[39]:





<matplotlib.text.Text at 0x10ecdd940>



In [ ]:



In [40]:

    
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell23_44']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR_unweighted, pcell.PDR_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3a = pd.merge(out, PDR_differences, how='inner')
print(pairs3a.shape)
pairs3a = pairs3a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs3a.PDR_difference # dependent variable to predict

X = pairs3a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



22
(231, 14)
(231,)
(231, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[40]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.086


  Model:                    OLS          Adj. R-squared:        0.074


  Method:              Least Squares     F-statistic:           7.148


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  0.000131


  Time:                  21:42:18        Log-Likelihood:       518.36


  No. Observations:          231         AIC:                  -1029.


  Df Residuals:              227         BIC:                  -1015.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -53.4329     42.873     -1.246   0.214   -137.913    31.047


  total_cpg_no_filter   7.974e-08   3.37e-08      2.367   0.019   1.34e-08  1.46e-07


  bsRate_mean             53.0258     42.955      1.234   0.218    -31.616   137.667


  avgReadCpG_mean          0.1018      0.023      4.504   0.000      0.057     0.146




  Omnibus:        41.269    Durbin-Watson:         1.288


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     58.104


  Skew:            1.109    Prob(JB):           2.41e-13


  Kurtosis:        4.058    Cond. No.           1.03e+10



In [41]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs3a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[41]:





<matplotlib.text.Text at 0x10f037518>



In [42]:

    
print("PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs3a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell23_44")









    



PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[42]:





<matplotlib.text.Text at 0x10f271128>



In [43]:

    
print("PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs3a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")









    



PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[43]:





<matplotlib.text.Text at 0x10f4a5f98>



In [ ]:



In [44]:

    
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell45_66']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR_unweighted, pcell.PDR_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3b = pd.merge(out, PDR_differences, how='inner')
print(pairs3b.shape)
pairs3b = pairs3b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs3b.PDR_difference # dependent variable to predict

X = pairs3b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



9
(36, 14)
(36,)
(36, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[44]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.170


  Model:                    OLS          Adj. R-squared:        0.092


  Method:              Least Squares     F-statistic:           2.185


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):    0.109 


  Time:                  21:42:21        Log-Likelihood:       75.676


  No. Observations:           36         AIC:                  -143.4


  Df Residuals:               32         BIC:                  -137.0


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -193.0638    256.757     -0.752   0.458   -716.061   329.933


  total_cpg_no_filter  -2.863e-08   3.73e-08     -0.768   0.448  -1.05e-07  4.73e-08


  bsRate_mean            192.8895    256.980      0.751   0.458   -330.561   716.340


  avgReadCpG_mean          0.1321      0.094      1.407   0.169     -0.059     0.323




  Omnibus:         1.669    Durbin-Watson:         1.709


  Prob(Omnibus):   0.434    Jarque-Bera (JB):      1.582


  Skew:            0.442    Prob(JB):              0.453


  Kurtosis:        2.478    Cond. No.           3.28e+10



In [45]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs3b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[45]:





<matplotlib.text.Text at 0x10f7d7438>



In [46]:

    
print("PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs3b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell45_66")









    



PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[46]:





<matplotlib.text.Text at 0x10f9ff7f0>



In [47]:

    
print("PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs3b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")









    



PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[47]:





<matplotlib.text.Text at 0x10fbe4588>



In [ ]:



In [ ]:



In [48]:

    
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell67_88']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR_unweighted, pcell.PDR_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3c = pd.merge(out, PDR_differences, how='inner')
print(pairs3c.shape)
pairs3c = pairs3c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs3c.PDR_difference # dependent variable to predict

X = pairs3c.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[48]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.005


  Model:                    OLS          Adj. R-squared:       -0.011


  Method:              Least Squares     F-statistic:          0.2850


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):    0.836 


  Time:                  21:42:25        Log-Likelihood:       400.81


  No. Observations:          190         AIC:                  -793.6


  Df Residuals:              186         BIC:                  -780.6


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -15.7505     46.752     -0.337   0.737   -107.984    76.483


  total_cpg_no_filter   2.689e-08   3.69e-08      0.728   0.468   -4.6e-08  9.98e-08


  bsRate_mean             15.8050     46.744      0.338   0.736    -76.412   108.022


  avgReadCpG_mean          0.0048      0.059      0.081   0.936     -0.112     0.121




  Omnibus:        11.921    Durbin-Watson:         1.586


  Prob(Omnibus):   0.003    Jarque-Bera (JB):     13.004


  Skew:            0.628    Prob(JB):            0.00150


  Kurtosis:        2.741    Cond. No.           8.67e+09



In [49]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs3c, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[49]:





<matplotlib.text.Text at 0x10ff16c18>



In [50]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs3c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell67_88")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27pcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[50]:





<matplotlib.text.Text at 0x11014f748>



In [51]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs3c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27pcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[51]:





<matplotlib.text.Text at 0x1104879e8>



In [ ]:



In [ ]:



In [52]:

    
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell1_22_']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR_unweighted, mcell.PDR_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4 = pd.merge(out, PDR_differences, how='inner')
print(pairs4.shape)
pairs4 = pairs4.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs4.PDR_difference # dependent variable to predict

X = pairs4.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



18
(153, 14)
(153,)
(153, 3)
Regression results for Normal 'NormalBCD19pCD27mcell1_22_', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[52]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.195


  Model:                    OLS          Adj. R-squared:        0.178


  Method:              Least Squares     F-statistic:           12.00


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  4.41e-07


  Time:                  21:42:28        Log-Likelihood:       418.19


  No. Observations:          153         AIC:                  -828.4


  Df Residuals:              149         BIC:                  -816.3


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  221.6233     39.953      5.547   0.000    142.676   300.571


  total_cpg_no_filter  -4.771e-08   2.51e-08     -1.900   0.059  -9.73e-08  1.92e-09


  bsRate_mean           -222.5182     40.124     -5.546   0.000   -301.805  -143.232


  avgReadCpG_mean          0.0595      0.027      2.207   0.029      0.006     0.113




  Omnibus:        50.762    Durbin-Watson:         1.626


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     92.796


  Skew:            1.610    Prob(JB):           7.07e-21


  Kurtosis:        5.045    Cond. No.           1.42e+10



In [53]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs4, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[53]:





<matplotlib.text.Text at 0x1106d2710>



In [54]:

    
print("PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs4, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell1_22_")









    



PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[54]:





<matplotlib.text.Text at 0x1109a1b00>



In [55]:

    
print("PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs4, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")









    



PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[55]:





<matplotlib.text.Text at 0x110cc6400>



In [ ]:



In [56]:

    
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell23_44']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR_unweighted, mcell.PDR_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4a = pd.merge(out, PDR_differences, how='inner')
print(pairs4a.shape)
pairs4a = pairs4a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs4a.PDR_difference # dependent variable to predict

X = pairs4a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell23_44', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



18
(153, 14)
(153,)
(153, 3)
Regression results for Normal 'NormalBCD19pCD27mcell23_44', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[56]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.108


  Model:                    OLS          Adj. R-squared:        0.090


  Method:              Least Squares     F-statistic:           6.029


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  0.000665


  Time:                  21:42:32        Log-Likelihood:       365.67


  No. Observations:          153         AIC:                  -723.3


  Df Residuals:              149         BIC:                  -711.2


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -24.0830     68.026     -0.354   0.724   -158.504   110.338


  total_cpg_no_filter    1.29e-07   4.91e-08      2.630   0.009   3.21e-08  2.26e-07


  bsRate_mean             24.2419     68.201      0.355   0.723   -110.524   159.008


  avgReadCpG_mean         -0.0234      0.054     -0.431   0.667     -0.131     0.084




  Omnibus:        71.754    Durbin-Watson:         0.220


  Prob(Omnibus):   0.000    Jarque-Bera (JB):    178.319


  Skew:            2.080    Prob(JB):           1.90e-39


  Kurtosis:        6.266    Cond. No.           1.55e+10



In [57]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs4a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[57]:





<matplotlib.text.Text at 0x11103a630>



In [58]:

    
print("PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs4a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell23_44")









    



PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[58]:





<matplotlib.text.Text at 0x111325780>



In [59]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs4a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[59]:





<matplotlib.text.Text at 0x11169e550>



In [ ]:



In [60]:

    
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell45_66']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR_unweighted, mcell.PDR_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4b = pd.merge(out, PDR_differences, how='inner')
print(pairs4b.shape)
pairs4b = pairs4b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs4b.PDR_difference # dependent variable to predict

X = pairs4b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell45_66', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



17
(136, 14)
(136,)
(136, 3)
Regression results for Normal 'NormalBCD19pCD27mcell45_66', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[60]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.174


  Model:                    OLS          Adj. R-squared:        0.155


  Method:              Least Squares     F-statistic:           9.257


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  1.34e-05


  Time:                  21:42:36        Log-Likelihood:       607.92


  No. Observations:          136         AIC:                  -1208.


  Df Residuals:              132         BIC:                  -1196.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    2.2489      7.386      0.304   0.761    -12.361    16.858


  total_cpg_no_filter   2.618e-08   6.08e-09      4.304   0.000   1.41e-08  3.82e-08


  bsRate_mean             -2.3824      7.400     -0.322   0.748    -17.020    12.255


  avgReadCpG_mean          0.0233      0.005      5.002   0.000      0.014     0.033




  Omnibus:        21.471    Durbin-Watson:         1.299


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     26.182


  Skew:            0.971    Prob(JB):           2.06e-06


  Kurtosis:        3.920    Cond. No.           1.27e+10



In [61]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs4b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[61]:





<matplotlib.text.Text at 0x111991cc0>



In [62]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs4b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell45_66")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[62]:





<matplotlib.text.Text at 0x111be0668>



In [63]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs4b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[63]:





<matplotlib.text.Text at 0x111dfa2b0>



In [ ]:



In [64]:

    
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell67_88']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR_unweighted, mcell.PDR_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4c = pd.merge(out, PDR_differences, how='inner')
print(pairs4c.shape)
pairs4c = pairs4c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs4c.PDR_difference # dependent variable to predict

X = pairs4c.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell67_88', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



21
(210, 14)
(210,)
(210, 3)
Regression results for Normal 'NormalBCD19pCD27mcell67_88', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[64]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.240


  Model:                    OLS          Adj. R-squared:        0.229


  Method:              Least Squares     F-statistic:           21.71


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  2.94e-12


  Time:                  21:42:40        Log-Likelihood:       587.51


  No. Observations:          210         AIC:                  -1167.


  Df Residuals:              206         BIC:                  -1154.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -317.6070     63.998     -4.963   0.000   -443.782  -191.431


  total_cpg_no_filter   1.886e-07   2.37e-08      7.953   0.000   1.42e-07  2.35e-07


  bsRate_mean            317.7381     64.116      4.956   0.000    191.331   444.145


  avgReadCpG_mean          0.1121      0.020      5.511   0.000      0.072     0.152




  Omnibus:        65.152    Durbin-Watson:         0.266


  Prob(Omnibus):   0.000    Jarque-Bera (JB):    127.989


  Skew:            1.542    Prob(JB):           1.61e-28


  Kurtosis:        5.262    Cond. No.           3.04e+10



In [65]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs4c, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")









    



PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[65]:





<matplotlib.text.Text at 0x1121ff8d0>



In [66]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs4c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[66]:





<matplotlib.text.Text at 0x1125464a8>



In [67]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs4c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[67]:





<matplotlib.text.Text at 0x1127f9780>



In [ ]:



In [68]:

    
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell1_22_']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.PDR_unweighted, CD19cell.PDR_unweighted)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5 = pd.merge(out, PDR_differences, how='inner')
print(pairs5.shape)
pairs5 = pairs5.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs5.PDR_difference # dependent variable to predict

X = pairs5.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



21
(210, 14)
(210,)
(210, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell1_22_', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[68]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.098


  Model:                    OLS          Adj. R-squared:        0.085


  Method:              Least Squares     F-statistic:           7.478


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  8.92e-05


  Time:                  21:42:45        Log-Likelihood:       322.46


  No. Observations:          210         AIC:                  -636.9


  Df Residuals:              206         BIC:                  -623.5


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -334.6772     86.455     -3.871   0.000   -505.126  -164.228


  total_cpg_no_filter  -2.412e-08   2.16e-08     -1.118   0.265  -6.66e-08  1.84e-08


  bsRate_mean            336.2137     86.685      3.879   0.000    165.310   507.117


  avgReadCpG_mean         -0.0986      0.059     -1.682   0.094     -0.214     0.017




  Omnibus:        87.232    Durbin-Watson:         1.967


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     15.224


  Skew:            0.304    Prob(JB):           0.000494


  Kurtosis:        1.829    Cond. No.           1.48e+10



In [69]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs5, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")









    



PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[69]:





<matplotlib.text.Text at 0x112afac50>



In [70]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs5, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell1_22_")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[70]:





<matplotlib.text.Text at 0x112d4a198>



In [71]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs5, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[71]:





<matplotlib.text.Text at 0x113124f60>



In [ ]:



In [72]:

    
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell23_44']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.PDR_unweighted, CD19cell.PDR_unweighted)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5a = pd.merge(out, PDR_differences, how='inner')
print(pairs5a.shape)
pairs5a = pairs5a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs5a.PDR_difference # dependent variable to predict

X = pairs5a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell23_44', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell23_44', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[72]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.165


  Model:                    OLS          Adj. R-squared:        0.151


  Method:              Least Squares     F-statistic:           12.24


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  2.41e-07


  Time:                  21:42:48        Log-Likelihood:       346.01


  No. Observations:          190         AIC:                  -684.0


  Df Residuals:              186         BIC:                  -671.0


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -242.6886     44.154     -5.496   0.000   -329.795  -155.582


  total_cpg_no_filter   7.302e-09   1.29e-08      0.566   0.572  -1.81e-08  3.27e-08


  bsRate_mean            244.0383     44.332      5.505   0.000    156.580   331.497


  avgReadCpG_mean         -0.1073      0.048     -2.248   0.026     -0.202    -0.013




  Omnibus:        13.477    Durbin-Watson:         1.115


  Prob(Omnibus):   0.001    Jarque-Bera (JB):     15.009


  Skew:            0.687    Prob(JB):           0.000551


  Kurtosis:        2.900    Cond. No.           1.63e+10



In [73]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs5a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")









    



PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[73]:





<matplotlib.text.Text at 0x112240ef0>



In [74]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs5a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell23_44")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[74]:





<matplotlib.text.Text at 0x1135a29b0>



In [75]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs5a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[75]:





<matplotlib.text.Text at 0x1138ac320>



In [ ]:



In [76]:

    
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell45_66']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.PDR_unweighted, CD19cell.PDR_unweighted)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5b = pd.merge(out, PDR_differences, how='inner')
print(pairs5b.shape)
pairs5b = pairs5b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs5b.PDR_difference # dependent variable to predict

X = pairs5b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell45_66', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



22
(231, 14)
(231,)
(231, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell45_66', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[76]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.253


  Model:                    OLS          Adj. R-squared:        0.244


  Method:              Least Squares     F-statistic:           25.69


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  2.41e-14


  Time:                  21:42:52        Log-Likelihood:       403.17


  No. Observations:          231         AIC:                  -798.3


  Df Residuals:              227         BIC:                  -784.6


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -802.7319    111.502     -7.199   0.000  -1022.442  -583.021


  total_cpg_no_filter   3.681e-08   1.67e-08      2.201   0.029   3.85e-09  6.98e-08


  bsRate_mean            805.5372    111.697      7.212   0.000    585.441  1025.633


  avgReadCpG_mean         -0.2354      0.064     -3.684   0.000     -0.361    -0.109




  Omnibus:        12.491    Durbin-Watson:         1.764


  Prob(Omnibus):   0.002    Jarque-Bera (JB):     11.652


  Skew:            0.490    Prob(JB):            0.00295


  Kurtosis:        2.498    Cond. No.           2.98e+10



In [77]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs5b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")









    



PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[77]:





<matplotlib.text.Text at 0x113b46550>



In [78]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs5b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell45_66")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[78]:





<matplotlib.text.Text at 0x113d93748>



In [79]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs5b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[79]:





<matplotlib.text.Text at 0x114167780>



In [ ]:



In [80]:

    
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell67_88']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.PDR_unweighted, CD19cell.PDR_unweighted)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5c = pd.merge(out, PDR_differences, how='inner')
print(pairs5c.shape)
pairs5c = pairs5c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs5c.PDR_difference # dependent variable to predict

X = pairs5c.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell67_88', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell67_88', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[80]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.055


  Model:                    OLS          Adj. R-squared:        0.040


  Method:              Least Squares     F-statistic:           3.600


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):   0.0146 


  Time:                  21:42:55        Log-Likelihood:       315.48


  No. Observations:          190         AIC:                  -623.0


  Df Residuals:              186         BIC:                  -610.0


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 -149.5143     94.421     -1.583   0.115   -335.788    36.759


  total_cpg_no_filter  -1.958e-08   1.81e-08     -1.081   0.281  -5.53e-08  1.62e-08


  bsRate_mean            150.4565     94.652      1.590   0.114    -36.273   337.186


  avgReadCpG_mean         -0.1089      0.047     -2.340   0.020     -0.201    -0.017




  Omnibus:        15.836    Durbin-Watson:         2.501


  Prob(Omnibus):   0.000    Jarque-Bera (JB):      7.226


  Skew:            0.251    Prob(JB):             0.0270


  Kurtosis:        2.187    Cond. No.           1.62e+10



In [81]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs5c, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")









    



PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[81]:





<matplotlib.text.Text at 0x1143fad68>



In [82]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs5c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[82]:





<matplotlib.text.Text at 0x114645f60>



In [83]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs5c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[83]:





<matplotlib.text.Text at 0x1149597f0>



In [ ]:



In [84]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_A1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6 = pd.merge(out, PDR_differences, how='inner')
print(pairs6.shape)
pairs6 = pairs6.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6.PDR_difference # dependent variable to predict

X = pairs6.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_A1_24', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_A1_24', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[84]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.063


  Model:                    OLS          Adj. R-squared:        0.046


  Method:              Least Squares     F-statistic:           3.715


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):   0.0127 


  Time:                  21:42:59        Log-Likelihood:       240.87


  No. Observations:          171         AIC:                  -473.7


  Df Residuals:              167         BIC:                  -461.2


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -10.4477     16.447     -0.635   0.526    -42.919    22.024


  total_cpg_no_filter   1.158e-07   1.76e-07      0.657   0.512  -2.32e-07  4.64e-07


  bsRate_mean             12.8866     17.158      0.751   0.454    -20.988    46.761


  avgReadCpG_mean         -0.3515      0.109     -3.230   0.001     -0.566    -0.137




  Omnibus:        22.110    Durbin-Watson:         1.894


  Prob(Omnibus):   0.000    Jarque-Bera (JB):      6.588


  Skew:            0.089    Prob(JB):             0.0371


  Kurtosis:        2.055    Cond. No.           9.64e+08



In [85]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24")









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[85]:





<matplotlib.text.Text at 0x114bdb668>



In [86]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_A1_24")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_A1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[86]:





<matplotlib.text.Text at 0x114e126a0>



In [87]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_A1_24")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_A1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[87]:





<matplotlib.text.Text at 0x114fa6a58>



In [ ]:



In [88]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_B1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6a = pd.merge(out, PDR_differences, how='inner')
print(pairs6a.shape)
pairs6a = pairs6a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6a.PDR_difference # dependent variable to predict

X = pairs6a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_B1_24', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'normal_B_cell_B1_24', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[88]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.006


  Model:                    OLS          Adj. R-squared:       -0.010


  Method:              Least Squares     F-statistic:          0.3493


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):    0.790 


  Time:                  21:43:03        Log-Likelihood:       241.02


  No. Observations:          190         AIC:                  -474.0


  Df Residuals:              186         BIC:                  -461.1


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -33.3112     43.938     -0.758   0.449   -119.992    53.370


  total_cpg_no_filter  -1.548e-08   5.69e-08     -0.272   0.786  -1.28e-07  9.69e-08


  bsRate_mean             35.2861     46.052      0.766   0.445    -55.566   126.138


  avgReadCpG_mean         -0.0943      0.094     -0.999   0.319     -0.281     0.092




  Omnibus:        626.340    Durbin-Watson:         2.029


  Prob(Omnibus):   0.000     Jarque-Bera (JB):     16.666


  Skew:            0.114     Prob(JB):           0.000240


  Kurtosis:        1.567     Cond. No.           4.74e+09



In [89]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24")









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[89]:





<matplotlib.text.Text at 0x11520a240>



In [90]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_B1_24")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_B1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[90]:





<matplotlib.text.Text at 0x115516eb8>



In [91]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_B1_24")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_B1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[91]:





<matplotlib.text.Text at 0x115819ef0>



In [ ]:



In [92]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_C1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6b = pd.merge(out, PDR_differences, how='inner')
print(pairs6b.shape)
pairs6b = pairs6b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6b.PDR_difference # dependent variable to predict

X = pairs6b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_C1_24', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_C1_24', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[92]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.028


  Model:                    OLS          Adj. R-squared:        0.011


  Method:              Least Squares     F-statistic:           1.625


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):    0.186 


  Time:                  21:43:09        Log-Likelihood:       232.89


  No. Observations:          171         AIC:                  -457.8


  Df Residuals:              167         BIC:                  -445.2


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   83.2234     39.882      2.087   0.038      4.485   161.962


  total_cpg_no_filter   1.676e-08   4.26e-08      0.394   0.694  -6.73e-08  1.01e-07


  bsRate_mean            -87.4228     41.729     -2.095   0.038   -169.808    -5.038


  avgReadCpG_mean          0.1828      0.123      1.488   0.139     -0.060     0.425




  Omnibus:        73.476    Durbin-Watson:         1.027


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     10.245


  Skew:           -0.000    Prob(JB):            0.00596


  Kurtosis:        1.801    Cond. No.           7.61e+09



In [93]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24")









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[93]:





<matplotlib.text.Text at 0x115a4acf8>



In [94]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_C1_24")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_C1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[94]:





<matplotlib.text.Text at 0x115c9ff60>



In [95]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_C1_24")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_C1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[95]:





<matplotlib.text.Text at 0x115fa6b38>



In [ ]:



In [96]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_D1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6c = pd.merge(out, PDR_differences, how='inner')
print(pairs6c.shape)
pairs6c = pairs6c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6c.PDR_difference # dependent variable to predict

X = pairs6c.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_D1_24', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_D1_24', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[96]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.021


  Model:                    OLS          Adj. R-squared:        0.004


  Method:              Least Squares     F-statistic:           1.223


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):    0.303 


  Time:                  21:43:13        Log-Likelihood:       242.78


  No. Observations:          171         AIC:                  -477.6


  Df Residuals:              167         BIC:                  -465.0


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   47.9533     26.147      1.834   0.068     -3.667    99.574


  total_cpg_no_filter   6.938e-09   3.59e-08      0.193   0.847   -6.4e-08  7.79e-08


  bsRate_mean            -49.8327     27.307     -1.825   0.070   -103.744     4.078


  avgReadCpG_mean          0.0074      0.087      0.086   0.932     -0.163     0.178




  Omnibus:        744.791    Durbin-Watson:         2.114


  Prob(Omnibus):   0.000     Jarque-Bera (JB):     16.422


  Skew:            0.205     Prob(JB):           0.000272


  Kurtosis:        1.538     Cond. No.           5.21e+09



In [97]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6c, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24")









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[97]:





<matplotlib.text.Text at 0x116294b38>



In [98]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_D1_24")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_D1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[98]:





<matplotlib.text.Text at 0x1164f4b00>



In [99]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_D1_24")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_D1_24






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[99]:





<matplotlib.text.Text at 0x1167fe9b0>



In [ ]:



In [100]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_G1_22']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6d = pd.merge(out, PDR_differences, how='inner')
print(pairs6d.shape)
pairs6d = pairs6d.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6d.PDR_difference # dependent variable to predict

X = pairs6d.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_G1_22', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_G1_22', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[100]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.079


  Model:                    OLS          Adj. R-squared:        0.063


  Method:              Least Squares     F-statistic:           4.801


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):   0.00311


  Time:                  21:43:17        Log-Likelihood:       251.52


  No. Observations:          171         AIC:                  -495.0


  Df Residuals:              167         BIC:                  -482.5


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                  -23.4274     27.827     -0.842   0.401    -78.366    31.511


  total_cpg_no_filter   9.462e-09    5.1e-08      0.186   0.853  -9.12e-08   1.1e-07


  bsRate_mean             22.7980     29.393      0.776   0.439    -35.232    80.828


  avgReadCpG_mean          0.2995      0.126      2.384   0.018      0.051     0.548




  Omnibus:        44.794    Durbin-Watson:         2.026


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     10.966


  Skew:            0.297    Prob(JB):            0.00416


  Kurtosis:        1.911    Cond. No.           4.87e+09



In [101]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6d, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22")









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[101]:





<matplotlib.text.Text at 0x116b3b1d0>



In [102]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6d, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_G1_22")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_G1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[102]:





<matplotlib.text.Text at 0x116d504e0>



In [103]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6d, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_G1_22")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_G1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[103]:





<matplotlib.text.Text at 0x117063f60>



In [ ]:



In [104]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_H1_22']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6e = pd.merge(out, PDR_differences, how='inner')
print(pairs6e.shape)
pairs6e = pairs6e.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6e.PDR_difference # dependent variable to predict

X = pairs6e.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_H1_22', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()









    



15
(105, 14)
(105,)
(105, 3)
Regression results for Normal 'normal_B_cell_H1_22', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')







    Out[104]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.126


  Model:                    OLS          Adj. R-squared:        0.101


  Method:              Least Squares     F-statistic:           4.875


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):   0.00330


  Time:                  21:43:23        Log-Likelihood:       158.14


  No. Observations:          105         AIC:                  -308.3


  Df Residuals:              101         BIC:                  -297.7


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                   72.0957     46.928      1.536   0.128    -20.997   165.188


  total_cpg_no_filter   2.255e-07   7.01e-08      3.217   0.002   8.65e-08  3.65e-07


  bsRate_mean            -72.3757     48.581     -1.490   0.139   -168.747    23.996


  avgReadCpG_mean         -0.4834      0.213     -2.270   0.025     -0.906    -0.061




  Omnibus:        12.572    Durbin-Watson:         1.411


  Prob(Omnibus):   0.002    Jarque-Bera (JB):      9.065


  Skew:            0.596    Prob(JB):             0.0108


  Kurtosis:        2.194    Cond. No.           5.84e+09



In [105]:

    
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6e, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22")









    



PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[105]:





<matplotlib.text.Text at 0x1172f0c18>



In [106]:

    
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6e, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_H1_22")









    



PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_H1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[106]:





<matplotlib.text.Text at 0x117539748>



In [107]:

    
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6e, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_H1_22")









    



PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_H1_22






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[107]:





<matplotlib.text.Text at 0x117761a58>



In [108]:

    
print(pairs1.shape)
print(pairs1a.shape)
print(pairs2.shape)
print(pairs2a.shape)
print(pairs2b.shape)
print(pairs3.shape)
print(pairs3a.shape)
print(pairs3b.shape)
print(pairs3c.shape)
print(pairs4.shape)
print(pairs4a.shape)
print(pairs4b.shape)
print(pairs4c.shape)
print(pairs5.shape)
print(pairs5a.shape)
print(pairs5b.shape)
print(pairs5c.shape)
print(pairs6.shape)
print(pairs6a.shape)
print(pairs6b.shape)
print(pairs6c.shape)
print(pairs6d.shape)
print(pairs6e.shape)









    



(210, 14)
(210, 14)
(190, 14)
(190, 14)
(190, 14)
(136, 14)
(231, 14)
(36, 14)
(190, 14)
(153, 14)
(153, 14)
(136, 14)
(210, 14)
(210, 14)
(190, 14)
(231, 14)
(190, 14)
(171, 14)
(190, 14)
(171, 14)
(171, 14)
(171, 14)
(105, 14)



In [109]:

    
pairs1['type'] = str('CLL')
pairs1a['type'] = str('CLL')
pairs2['type'] = str('CLL')
pairs2a['type'] = str('CLL')
pairs2b['type'] = str('CLL')
pairs3['type'] = str('normal')
pairs3a['type'] = str('normal')
pairs3b['type'] = str('normal')
pairs3c['type'] = str('normal')
pairs4['type'] = str('normal')
pairs4a['type'] = str('normal')
pairs4b['type'] = str('normal')
pairs4c['type'] = str('normal')
pairs5['type'] = str('normal')
pairs5a['type'] = str('normal')
pairs5b['type'] = str('normal')
pairs5c['type'] = str('normal')
pairs6['type'] = str('normal')
pairs6a['type'] = str('normal')
pairs6b['type'] = str('normal')
pairs6c['type'] = str('normal')
pairs6d['type'] = str('normal')
pairs6e['type'] = str('normal')


frames = [pairs1, pairs1a, pairs2, pairs2a, pairs2b, pairs3, pairs3a, pairs3b, pairs3c, 
          pairs4, pairs4a, pairs4b, pairs4c, pairs5, pairs5a, pairs5b, pairs5c, pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e]



In [110]:

    
total_pairs = pd.concat(frames)



In [111]:

    
total_pairs.shape









    Out[111]:





(4035, 15)



In [112]:

    
y = total_pairs.PDR_difference # dependent variable
X = total_pairs.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)



In [113]:

    
categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)
    
X = X.drop(['type_normal'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR_unweighted")
est.summary()









    



Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR_unweighted






    Out[113]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:              0.294 


  Model:                    OLS          Adj. R-squared:         0.294 


  Method:              Least Squares     F-statistic:            420.0 


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):   6.78e-303


  Time:                  21:43:27        Log-Likelihood:        6995.5 


  No. Observations:         4035         AIC:                -1.398e+04


  Df Residuals:             4030         BIC:                -1.395e+04


  Df Model:                    4                                       


  Covariance Type:       nonrobust                                     




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    1.5161      0.059     25.786   0.000      1.401     1.631


  total_cpg_no_filter    2.43e-08   3.71e-09      6.551   0.000    1.7e-08  3.16e-08


  bsRate_mean             -1.2129      0.044    -27.679   0.000     -1.299    -1.127


  avgReadCpG_mean         -0.0520      0.008     -6.835   0.000     -0.067    -0.037


  type_CLL                -0.0677      0.002    -37.237   0.000     -0.071    -0.064




  Omnibus:        443.674    Durbin-Watson:         1.574 


  Prob(Omnibus):   0.000     Jarque-Bera (JB):    607.560 


  Skew:            0.881     Prob(JB):           1.17e-132


  Kurtosis:        3.714     Cond. No.           4.97e+07



In [114]:

    
X.head()









    Out[114]:






  
    
      
      const
      total_cpg_no_filter
      bsRate_mean
      avgReadCpG_mean
      type_CLL
    
  
  
    
      0
      1
      709541.5
      0.980181
      5.354731
      1.0
    
    
      1
      1
      786284.0
      0.980293
      5.367319
      1.0
    
    
      2
      1
      683623.5
      0.980268
      5.369633
      1.0
    
    
      3
      1
      853200.5
      0.980262
      5.367604
      1.0
    
    
      4
      1
      842686.5
      0.980228
      5.374553
      1.0



In [115]:

    
print("PDR_difference vs. bsRate, jointplot, both CLL and normal ")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, both CLL and normal ")









    



PDR_difference vs. bsRate, jointplot, both CLL and normal 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[115]:





<matplotlib.text.Text at 0x117aafb00>



In [116]:

    
print("PDR_difference vs. mean avgReadCpG per cell, jointplot, both CLL and normal ")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpG_mean, jointplot, both CLL and normal ")









    



PDR_difference vs. mean avgReadCpG per cell, jointplot, both CLL and normal 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[116]:





<matplotlib.text.Text at 0x117d90ef0>



In [117]:

    
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. total # Unique_CpGs per cell, jointplot, both CLL and normal ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[117]:





<matplotlib.text.Text at 0x11807f358>



In [118]:

    
print("PDR_difference vs. bsRate, by type! ")
sns.lmplot(x="bsRate_mean", y="PDR_difference",  data=total_pairs, hue='type')
plt.title("PDR_difference vs. bsRate, by type, CLL vs normal")









    



PDR_difference vs. bsRate, by type! 






    Out[118]:





<matplotlib.text.Text at 0x1184555c0>



In [119]:

    
print("PDR_difference vs. avgReadCpG per cell, by type! ")
sns.lmplot(x="avgReadCpG_mean", y="PDR_difference",  data=total_pairs, hue='type')
plt.title("PDR_difference vs. avgReadCpG per cell, by type, CLL vs Normal ")









    



PDR_difference vs. avgReadCpG per cell, by type! 






    Out[119]:





<matplotlib.text.Text at 0x1184ca2b0>



In [120]:

    
print("PDR_difference vs. total unique CpG per cel, by type!")
sns.lmplot(x="total_cpg_no_filter", y="PDR_difference",  data=total_pairs, hue='type')
plt.title("PDR_difference vs. total unique CpG per cell, by type, CLL vs Normal ")









    



PDR_difference vs. total unique CpG per cel, by type!






    Out[120]:





<matplotlib.text.Text at 0x1185c8780>



In [121]:

    
#
# Let's see feature ranking
#
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: cell type, avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:11: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting






    



Random Forest model score is 0.899985586214



In [122]:

    
#
# Now do analysis by cell type, CLL vs normal, entire
#
cll_frames = [pairs1, pairs1a, pairs2, pairs2a, pairs2b]

normal_frames = [pairs3, pairs3a, pairs3b, pairs3c, pairs4, pairs4a, pairs4b, pairs4c, 
                 pairs5, pairs5a, pairs5b, pairs5c, pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e]

cll_pairs = pd.concat(cll_frames)
print(cll_pairs.shape)

normal_pairs = pd.concat(normal_frames)
print(normal_pairs.shape)









    



(990, 15)
(3045, 15)



In [123]:

    
#
# CLL first
#
y = cll_pairs.PDR_difference # dependent variable
X = cll_pairs.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG", "type"], axis=1)

X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR")
print(X.shape)
est.summary()









    



Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR
(990, 4)






    Out[123]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.014


  Model:                    OLS          Adj. R-squared:        0.011


  Method:              Least Squares     F-statistic:           4.693


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):   0.00293


  Time:                  21:46:31        Log-Likelihood:       3925.6


  No. Observations:          990         AIC:                  -7843.


  Df Residuals:              986         BIC:                  -7824.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    0.0578      0.029      1.968   0.049      0.000     0.116


  total_cpg_no_filter    1.48e-10   1.25e-09      0.118   0.906  -2.31e-09   2.6e-09


  bsRate_mean             -0.0679      0.028     -2.418   0.016     -0.123    -0.013


  avgReadCpG_mean          0.0026      0.002      1.290   0.197     -0.001     0.007




  Omnibus:        127.540    Durbin-Watson:         2.003


  Prob(Omnibus):   0.000     Jarque-Bera (JB):    181.180


  Skew:            0.941     Prob(JB):           4.54e-40


  Kurtosis:        3.923     Cond. No.           1.63e+08



In [124]:

    
print("PDR_difference vs. bsRate, jointplot, CLL pairs ")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  CLL pairs ")









    



PDR_difference vs. bsRate, jointplot, CLL pairs 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[124]:





<matplotlib.text.Text at 0x117d862b0>



In [125]:

    
print("PDR_difference vs. mean avgReadCpG per cell, jointplot,  CLL pairs ")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpG_mean, jointplot, CLL pairs ")









    



PDR_difference vs. mean avgReadCpG per cell, jointplot,  CLL pairs 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[125]:





<matplotlib.text.Text at 0x22ab6e400>



In [126]:

    
print("PDR difference vs. total # unique CpGs per cell, CLL pairs")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. total # Unique_CpGs per cell,  CLL pairs")









    



PDR difference vs. total # unique CpGs per cell, CLL pairs






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[126]:





<matplotlib.text.Text at 0x22aff0ac8>



In [127]:

    
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting






    



Random Forest model score is 0.871840994723



In [ ]:



In [128]:

    
#
# normal
#
y = normal_pairs.PDR_difference # dependent variable
X = normal_pairs.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG", "type"], axis=1)

X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR")
print(X.shape)
est.summary()









    



Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR
(3045, 4)






    Out[128]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.182 


  Model:                    OLS          Adj. R-squared:        0.182 


  Method:              Least Squares     F-statistic:           226.2 


  Date:              Tue, 09 Aug 2016    Prob (F-statistic):  1.73e-132


  Time:                  21:47:19        Log-Likelihood:       4868.6 


  No. Observations:         3045         AIC:                  -9729. 


  Df Residuals:             3041         BIC:                  -9705. 


  Df Model:                    3                                      


  Covariance Type:       nonrobust                                    




                         coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                    1.6137      0.071     22.778   0.000      1.475     1.753


  total_cpg_no_filter   1.962e-08   4.97e-09      3.946   0.000   9.87e-09  2.94e-08


  bsRate_mean             -1.2785      0.053    -24.242   0.000     -1.382    -1.175


  avgReadCpG_mean         -0.0578      0.010     -6.006   0.000     -0.077    -0.039




  Omnibus:        231.939    Durbin-Watson:         1.583


  Prob(Omnibus):   0.000     Jarque-Bera (JB):    286.202


  Skew:            0.748     Prob(JB):           7.11e-63


  Kurtosis:        2.857     Cond. No.           4.12e+07



In [129]:

    
print("PDR_difference vs. bsRate, jointplot, Normal pairs ")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal pairs ")









    



PDR_difference vs. bsRate, jointplot, Normal pairs 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[129]:





<matplotlib.text.Text at 0x118f16278>



In [130]:

    
print("PDR_difference vs. mean avgReadCpG per cell, jointplot, Normal pairs ")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpG_mean, jointplot, Normal pairs ")









    



PDR_difference vs. mean avgReadCpG per cell, jointplot, Normal pairs 






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[130]:





<matplotlib.text.Text at 0x1192d4240>



In [131]:

    
print("PDR diference vs. total # unique CpGs per cell, Normal pairs")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. total # Unique_CpGs per cell, Normal pairs")









    



PDR diference vs. total # unique CpGs per cell, Normal pairs






    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[131]:





<matplotlib.text.Text at 0x1197cb748>



In [132]:

    
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting






    



Random Forest model score is 0.883297918148



In [ ]:



In [ ]:



In [ ]:



In [ ]:

	filename	methylation	PDR_total	methylation_unweighted	PDR_unweighted	thisMeth	mixedReadCount	total_reads	type	bio	protocol	total_cpg_no_filter	total_cpg_gtrthan1	total_cpg_gtrthan38	bsRate	avgReadCpgs_nofilter	avgReadCpgs_lessthan1CpG	avgReadCpgs_gtreql3.8CpG
0	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CATGAC	0.529505	0.235795	0.632802	0.231878	2208325.0	983394.0	4170549.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	525282.0	525251.0	435636.0	0.9975	5.354284	5.355660	7.019255
1	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CCTTCG	0.455550	0.177631	0.583859	0.175371	733064.0	285841.0	1609185.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	221972.0	221962.0	186757.0	0.9975	5.587294	5.588449	7.302612
2	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CGGTAG	0.515269	0.177645	0.618578	0.174221	1452802.0	500870.0	2819500.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	355730.0	355713.0	295624.0	0.9975	5.393199	5.394331	7.079288
3	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTATTG	0.556175	0.176367	0.652727	0.172273	2279354.0	722800.0	4098270.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	483179.0	483150.0	397812.0	0.9975	5.287116	5.288477	6.979525
4	RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTCAGC	0.528642	0.181331	0.640401	0.172287	1394208.0	478231.0	2637340.0	normal	CD19CD27m	NormalBCD19pCD27mcell67_88	356122.0	356100.0	294065.0	0.9975	5.314302	5.315719	6.995052

Dep. Variable:	PDR_difference	R-squared:	0.046
Model:	OLS	Adj. R-squared:	0.033
Method:	Least Squares	F-statistic:	3.346
Date:	Tue, 09 Aug 2016	Prob (F-statistic):	0.0201
Time:	21:41:58	Log-Likelihood:	884.34
No. Observations:	210	AIC:	-1761.
Df Residuals:	206	BIC:	-1747.
Df Model:	3
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
const	8.9491	3.463	2.584	0.010	2.121 15.777
total_cpg_no_filter	4.547e-09	3.87e-09	1.175	0.241	-3.08e-09 1.22e-08
bsRate_mean	-9.2084	3.533	-2.606	0.010	-16.175 -2.242
avgReadCpG_mean	0.0147	0.009	1.574	0.117	-0.004 0.033

Omnibus:	11.112	Durbin-Watson:	2.070
Prob(Omnibus):	0.004	Jarque-Bera (JB):	11.804
Skew:	0.560	Prob(JB):	0.00273
Kurtosis:	2.692	Cond. No.	1.46e+10

Omnibus:	14.102	Durbin-Watson:	2.436
Prob(Omnibus):	0.001	Jarque-Bera (JB):	10.064
Skew:	0.417	Prob(JB):	0.00653
Kurtosis:	2.327	Cond. No.	2.05e+10

Omnibus:	13.402	Durbin-Watson:	2.209
Prob(Omnibus):	0.001	Jarque-Bera (JB):	11.455
Skew:	0.518	Prob(JB):	0.00326
Kurtosis:	2.390	Cond. No.	3.35e+09

Omnibus:	16.926	Durbin-Watson:	1.740
Prob(Omnibus):	0.000	Jarque-Bera (JB):	18.834
Skew:	0.756	Prob(JB):	8.13e-05
Kurtosis:	3.304	Cond. No.	4.71e+09

Omnibus:	13.999	Durbin-Watson:	1.928
Prob(Omnibus):	0.001	Jarque-Bera (JB):	14.870
Skew:	0.656	Prob(JB):	0.000590
Kurtosis:	3.398	Cond. No.	6.97e+09

Omnibus:	9.622	Durbin-Watson:	1.838
Prob(Omnibus):	0.008	Jarque-Bera (JB):	10.406
Skew:	0.665	Prob(JB):	0.00550
Kurtosis:	2.740	Cond. No.	7.73e+09

Omnibus:	41.269	Durbin-Watson:	1.288
Prob(Omnibus):	0.000	Jarque-Bera (JB):	58.104
Skew:	1.109	Prob(JB):	2.41e-13
Kurtosis:	4.058	Cond. No.	1.03e+10

Omnibus:	1.669	Durbin-Watson:	1.709
Prob(Omnibus):	0.434	Jarque-Bera (JB):	1.582
Skew:	0.442	Prob(JB):	0.453
Kurtosis:	2.478	Cond. No.	3.28e+10

Omnibus:	11.921	Durbin-Watson:	1.586
Prob(Omnibus):	0.003	Jarque-Bera (JB):	13.004
Skew:	0.628	Prob(JB):	0.00150
Kurtosis:	2.741	Cond. No.	8.67e+09

Omnibus:	50.762	Durbin-Watson:	1.626
Prob(Omnibus):	0.000	Jarque-Bera (JB):	92.796
Skew:	1.610	Prob(JB):	7.07e-21
Kurtosis:	5.045	Cond. No.	1.42e+10

Omnibus:	71.754	Durbin-Watson:	0.220
Prob(Omnibus):	0.000	Jarque-Bera (JB):	178.319
Skew:	2.080	Prob(JB):	1.90e-39
Kurtosis:	6.266	Cond. No.	1.55e+10

Omnibus:	21.471	Durbin-Watson:	1.299
Prob(Omnibus):	0.000	Jarque-Bera (JB):	26.182
Skew:	0.971	Prob(JB):	2.06e-06
Kurtosis:	3.920	Cond. No.	1.27e+10

Omnibus:	65.152	Durbin-Watson:	0.266
Prob(Omnibus):	0.000	Jarque-Bera (JB):	127.989
Skew:	1.542	Prob(JB):	1.61e-28
Kurtosis:	5.262	Cond. No.	3.04e+10

Omnibus:	87.232	Durbin-Watson:	1.967
Prob(Omnibus):	0.000	Jarque-Bera (JB):	15.224
Skew:	0.304	Prob(JB):	0.000494
Kurtosis:	1.829	Cond. No.	1.48e+10

Omnibus:	13.477	Durbin-Watson:	1.115
Prob(Omnibus):	0.001	Jarque-Bera (JB):	15.009
Skew:	0.687	Prob(JB):	0.000551
Kurtosis:	2.900	Cond. No.	1.63e+10

Omnibus:	12.491	Durbin-Watson:	1.764
Prob(Omnibus):	0.002	Jarque-Bera (JB):	11.652
Skew:	0.490	Prob(JB):	0.00295
Kurtosis:	2.498	Cond. No.	2.98e+10

Omnibus:	15.836	Durbin-Watson:	2.501
Prob(Omnibus):	0.000	Jarque-Bera (JB):	7.226
Skew:	0.251	Prob(JB):	0.0270
Kurtosis:	2.187	Cond. No.	1.62e+10

Omnibus:	22.110	Durbin-Watson:	1.894
Prob(Omnibus):	0.000	Jarque-Bera (JB):	6.588
Skew:	0.089	Prob(JB):	0.0371
Kurtosis:	2.055	Cond. No.	9.64e+08

Omnibus:	626.340	Durbin-Watson:	2.029
Prob(Omnibus):	0.000	Jarque-Bera (JB):	16.666
Skew:	0.114	Prob(JB):	0.000240
Kurtosis:	1.567	Cond. No.	4.74e+09

Omnibus:	73.476	Durbin-Watson:	1.027
Prob(Omnibus):	0.000	Jarque-Bera (JB):	10.245
Skew:	-0.000	Prob(JB):	0.00596
Kurtosis:	1.801	Cond. No.	7.61e+09

Omnibus:	744.791	Durbin-Watson:	2.114
Prob(Omnibus):	0.000	Jarque-Bera (JB):	16.422
Skew:	0.205	Prob(JB):	0.000272
Kurtosis:	1.538	Cond. No.	5.21e+09

Omnibus:	44.794	Durbin-Watson:	2.026
Prob(Omnibus):	0.000	Jarque-Bera (JB):	10.966
Skew:	0.297	Prob(JB):	0.00416
Kurtosis:	1.911	Cond. No.	4.87e+09

Omnibus:	12.572	Durbin-Watson:	1.411
Prob(Omnibus):	0.002	Jarque-Bera (JB):	9.065
Skew:	0.596	Prob(JB):	0.0108
Kurtosis:	2.194	Cond. No.	5.84e+09

Omnibus:	443.674	Durbin-Watson:	1.574
Prob(Omnibus):	0.000	Jarque-Bera (JB):	607.560
Skew:	0.881	Prob(JB):	1.17e-132
Kurtosis:	3.714	Cond. No.	4.97e+07

	const	total_cpg_no_filter	bsRate_mean	avgReadCpG_mean	type_CLL
0	1	709541.5	0.980181	5.354731	1.0
1	1	786284.0	0.980293	5.367319	1.0
2	1	683623.5	0.980268	5.369633	1.0
3	1	853200.5	0.980262	5.367604	1.0
4	1	842686.5	0.980228	5.374553	1.0

Omnibus:	127.540	Durbin-Watson:	2.003
Prob(Omnibus):	0.000	Jarque-Bera (JB):	181.180
Skew:	0.941	Prob(JB):	4.54e-40
Kurtosis:	3.923	Cond. No.	1.63e+08

Omnibus:	231.939	Durbin-Watson:	1.583
Prob(Omnibus):	0.000	Jarque-Bera (JB):	286.202
Skew:	0.748	Prob(JB):	7.11e-63
Kurtosis:	2.857	Cond. No.	4.12e+07