notebook.community

Edit and run



In [1]:

    
%matplotlib inline



In [2]:

    
#
# We need a regression model for predicting PDR (highest priority): 
# Variables are: 
#   - Number of unique CpGs per cell
#   - Median Average Read CpG per cell (or mean if normally distributed)
#   - BS rate per cell
#   - CLL or Normal status per cell
# What is the coefficient and the P value for each variable for a model predicting PDR?



In [3]:

    
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import statsmodels.api as sm
pd.set_option('display.max_columns', 50) # print all rows


import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')



In [ ]:



In [4]:

    
normal_cellA_df = pd.read_csv("unweighted_PDR_methyl_RRBS_normal_B.csv")
normal_cellB_df = pd.read_csv("unweighted_PDR_methyl_CD27pcell.csv") 
normal_cellC_df = pd.read_csv("unweighted_PDR_methyl_CD27mcell.csv")
cll_cellA_df = pd.read_csv("unweighted_PDR_methyl_RRBS_cw154.csv")
cll_cellC_df = pd.read_csv("unweighted_PDR_methyl_RRBS_trito_pool.csv")



In [5]:

    
normal_cellA_df = normal_cellA_df.drop(["Unnamed: 0"], axis=1)  
normal_cellA_df["type"] = str('normal')
normal_cellA_df["bio"] = str('normal_B')
normal_cellA_df["protocol"] = normal_cellA_df["filename"].str[5:24]
normal_cellA_df["filename"] = normal_cellA_df["filename"].str[:40]



In [6]:

    
cpg1 = pd.read_csv('Meth_PDR_cell_RRBS_normal_B1_CpGs.csv')
cpg1 = cpg1.drop(["Unnamed: 0"], axis=1)  
cpg1["filename"] = cpg1["filename"].str[:40]
normal_cellA_df = pd.merge(normal_cellA_df, cpg1, how='inner')
normal_cellB_df["type"] = str('normal')
normal_cellB_df = normal_cellB_df.drop(["Unnamed: 0"], axis=1) 
normal_cellB_df["bio"] = str('CD27p')
normal_cellB_df["protocol"] = normal_cellB_df["filename"].str[5:31]
normal_cellB_df["filename"] = normal_cellB_df["filename"].str[:50]
normal_cellB_df["filename"] = normal_cellB_df["filename"].str.replace(r'.dan$', '')
normal_cellB_df["filename"] = normal_cellB_df["filename"].str.replace(r'.da$', '')
cpg2 = pd.read_csv('NormalBCD19pCD27pcell_CpGs.csv')
cpg2 = cpg2.drop(["Unnamed: 0"], axis=1)  
cpg2["filename"] = cpg2["filename"].str[:50]
cpg2["filename"] = cpg2["filename"].str.replace(r'.dan$', '')
cpg2["filename"] = cpg2["filename"].str.replace(r'.da$', '')
normal_cellB_df = pd.merge(normal_cellB_df, cpg2, how='inner')
normal_cellC_df = normal_cellC_df.drop(["Unnamed: 0"], axis=1)  
normal_cellC_df["type"] = str('normal')
normal_cellC_df["protocol"] = normal_cellC_df["filename"].str[5:31]
normal_cellC_df["bio"] = str('CD27m')
normal_cellC_df["filename"] = normal_cellC_df["filename"].str[:50]
normal_cellC_df["filename"] = normal_cellC_df["filename"].str.replace(r'.dan$', '')
normal_cellC_df["filename"] = normal_cellC_df["filename"].str.replace(r'.da$', '')
cpg3 = pd.read_csv("NormalBCD19pCD27mcell_CpGs.csv")
cpg3 = cpg3.drop(["Unnamed: 0"], axis=1)  
cpg3["filename"] = cpg3["filename"].str[:50]
cpg3["filename"] = cpg3["filename"].str.replace(r'.dan$', '')
cpg3["filename"] = cpg3["filename"].str.replace(r'.da$', '')
normal_cellC_df = pd.merge(normal_cellC_df, cpg3, how='inner')
frames3 = [normal_cellA_df, normal_cellB_df, normal_cellC_df]
normal_result = pd.concat(frames3)
print(normal_result.shape)
print(normal_result.columns)









    



(304, 9)
Index(['PDR_unweighted', 'avgReadCpGs_mean', 'avgReadCpGs_median', 'bio',
       'filename', 'methylation_unweighted', 'protocol', 'total_reads',
       'type'],
      dtype='object')



In [7]:

    
normal_result = normal_result[['filename', 'PDR_unweighted', 'total_reads', 'type', 'bio', 'protocol', 'avgReadCpGs_mean', 'avgReadCpGs_median']]



In [8]:

    
cll_cellA_df = cll_cellA_df.drop(["Unnamed: 0"], axis=1) 
cll_cellA_df["type"] = str('CLL')
cll_cellA_df["protocol"] = cll_cellA_df["filename"].str[5:34]
cll_cellA_df["protocol"][cll_cellA_df["protocol"] == 'cw154_CutSmart_proteinase_K_T'] = 'cw154_CutSmart_proteinase_K'
cll_cellA_df["protocol"][cll_cellA_df["protocol"] == 'cw154_Tris_protease_GR_CAGAGA'] = 'cw154_Tris_protease_GR'
cll_cellA_df["protocol"][(cll_cellA_df["protocol"] != 
        'cw154_Tris_protease_GR') & (cll_cellA_df["protocol"] != 'cw154_CutSmart_proteinase_K')] = 'cw154_Tris_protease'
cll_cellA_df["bio"] = str('CLL')
cll_cellA_df["filename"] = cll_cellA_df["filename"].str[:51]
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.da$', '')
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.annoRR$', '')
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.ann$', '')
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.dan$', '')
cpg4 = pd.read_csv('CLL_RRBS_cw154_A_CpGs.csv')
cpg4 = cpg4.drop(["Unnamed: 0"], axis=1)  
cpg4["filename"] = cpg4["filename"].str[:51]
cpg4["filename"] = cpg4["filename"].str.replace(r'.da$', '')
cpg4["filename"] = cpg4["filename"].str.replace(r'.annoRR$', '')
cpg4["filename"] = cpg4["filename"].str.replace(r'.ann$', '')
cpg4["filename"] = cpg4["filename"].str.replace(r'.dan$', '')
cll_cellA_df = pd.merge(cll_cellA_df, cpg4, how='inner')
cll_cellC_df = cll_cellC_df.drop(["Unnamed: 0"], axis=1) 
cll_cellC_df["type"] = str('CLL')
cll_cellC_df["bio"] = str('CLL')
cll_cellC_df["protocol"] = cll_cellC_df["filename"].str[5:17]
cll_cellC_df["filename"] = cll_cellC_df["filename"].str[:33]
cpg5 = pd.read_csv('Meth_PDR_cell_RRBS_trito_pool_CpGs.csv')
cpg5 = cpg5.drop(["Unnamed: 0"], axis=1) 
cpg5["filename"] = cpg5["filename"].str[:33]
cll_cellC_df = pd.merge(cll_cellC_df, cpg5, how='inner')
frames2 = [cll_cellA_df, cll_cellC_df]
cll_result = pd.concat(frames2)
print(cll_result.shape)
print(cll_result.columns)









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy






    



(110, 9)
Index(['PDR_unweighted', 'avgReadCpGs_mean', 'avgReadCpGs_median', 'bio',
       'filename', 'methylation_unweighted', 'protocol', 'total_reads',
       'type'],
      dtype='object')



In [9]:

    
cll_result = cll_result[['filename', 'PDR_unweighted', 'total_reads', 'type', 'bio', 'protocol', 'avgReadCpGs_mean', 'avgReadCpGs_median']]
cll_result = cll_result.reset_index(drop=True)
normal_result = normal_result.reset_index(drop=True)
combined2 = normal_result.append(cll_result)
combined2 = combined2.reset_index(drop=True)
combined2.head()









    Out[9]:






  
    
      
      filename
      PDR_unweighted
      total_reads
      type
      bio
      protocol
      avgReadCpGs_mean
      avgReadCpGs_median
    
  
  
    
      0
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACAACC
      0.254835
      11894660.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.295301
      4.0
    
    
      1
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACCGCG
      0.390562
      3744659.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.285714
      4.0
    
    
      2
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACGTGG
      0.266418
      10461874.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.453122
      5.0
    
    
      3
      RRBS_normal_B_cell_A1_24_TAAGGCGA.ACTCAC
      0.265385
      14051.0
      normal
      normal_B
      normal_B_cell_A1_24
      4.950166
      4.0
    
    
      4
      RRBS_normal_B_cell_A1_24_TAAGGCGA.AGGATG
      0.240201
      21928743.0
      normal
      normal_B
      normal_B_cell_A1_24
      5.366276
      5.0



In [10]:

    
bs = pd.read_table('allStats.txt')
bs = bs.drop('sample.1', axis=1)
bs = bs.drop('sample.2', axis=1)
bs = bs.reset_index(drop=True)
bs = bs.drop('class', axis=1)
bs = bs.drop('totMeth', axis=1)
bs = bs.drop('totSeen', axis=1)
bs = bs.drop('avSum', axis=1)
bs = bs.drop('avTot', axis=1)
bs = bs.drop('rMixed', axis=1)
bs = bs.drop('rTot', axis=1)
bs = bs.drop('rAv', axis=1)
bs = bs.drop('rAvTot', axis=1)
bs = bs.drop('bed', axis=1)
bs = bs.drop('methInfoFile', axis=1)
bs = bs.drop('totReads', axis=1)
bs = bs.drop('totAligned', axis=1)
bs = bs.drop('totClipped', axis=1)
bs = bs.drop('totSeenCpG', axis=1)
bs = bs.drop('totUsed', axis=1)
bs = bs.drop('totMethCpG', axis=1)
# bs = bs.drop('totCpG', axis=1)
bs = bs.drop('totalReadPairs', axis=1)
bs = bs.drop('alignedReads', axis=1)
bs = bs.drop('totalReads', axis=1)
bs = bs.rename(columns = {'sample':'filename'})



In [11]:

    
merged = pd.merge(combined2, bs, how='inner')
merged = merged.reset_index(drop=True)
merged = merged.rename(columns = {'PDR_unweighted':'PDR'})
merged = merged.rename(columns = {'totCpG':'Unique_CpGs'})
# Remove all data points with less than 100k in totcpg 
merged = merged[merged['total_reads'] > 100000]
scattermatrix1 = merged.drop(['filename', 'total_reads', 'type', 'protocol', 'avgReadCpGs_median'], axis=1)
sns.lmplot(x="bsRate", y="PDR",  data=scattermatrix1)
plt.title("PDR vs bisulfite conversion rate")









    Out[11]:





<matplotlib.text.Text at 0x112ef9550>



In [12]:

    
scattermatrix7 = merged.drop(['filename', 'bio', 'avgReadCpGs_median'], axis=1)
y = scattermatrix7.PDR # dependent variable
print(y.shape)
X = scattermatrix7.drop(['PDR', 'total_reads', 'protocol'], axis=1)
print(X.shape)

categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)
    
import statsmodels.api as sm
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predictPDR")
print("Variables: Number of unique CpGs per cell, mean Average Read CpG per cell, BS rate, Cll or Normal B")
est.summary()









    



(358,)
(358, 4)
Regression results for CLL 'RRBS_trito_pool1', predictPDR
Variables: Number of unique CpGs per cell, mean Average Read CpG per cell, BS rate, Cll or Normal B






    Out[12]:





OLS Regression Results

  Dep. Variable:            PDR          R-squared:             0.501


  Model:                    OLS          Adj. R-squared:        0.495


  Method:              Least Squares     F-statistic:           88.54


  Date:              Fri, 08 Jul 2016    Prob (F-statistic):  4.93e-52


  Time:                  16:04:48        Log-Likelihood:       502.75


  No. Observations:          358         AIC:                  -995.5


  Df Residuals:              353         BIC:                  -976.1


  Df Model:                    4                                     


  Covariance Type:       nonrobust                                   




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 2.2452      0.160     14.058   0.000      1.931     2.559


  avgReadCpGs_mean     -0.1098      0.024     -4.547   0.000     -0.157    -0.062


  Unique_CpGs         1.57e-08   1.32e-08      1.186   0.236  -1.03e-08  4.17e-08


  bsRate               -2.5205      0.202    -12.457   0.000     -2.918    -2.123


  type_CLL              1.1512      0.079     14.555   0.000      0.996     1.307


  type_normal           1.0939      0.081     13.539   0.000      0.935     1.253




  Omnibus:        66.898    Durbin-Watson:         1.066


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     16.145


  Skew:            0.158    Prob(JB):           0.000312


  Kurtosis:        2.009    Cond. No.           6.18e+20



In [13]:

    
tritopool = merged[merged["protocol"] == 'trito_pool_1']
tritopool = tritopool.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
tritopoolA = tritopool.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool.filename,2))
out = pd.DataFrame([tritopoolA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.PDR, tritopool.PDR)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs = pd.merge(out, pdr_differences, how='inner')
print(pairs.shape)
pairs = pairs.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs.PDR_difference # dependent variable
X = pairs.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict \delta PDR")
est.summary()









    



(210, 7)
(210,)
(210, 3)
Regression results for CLL 'RRBS_trito_pool1', predict \delta PDR






    Out[13]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.041


  Model:                    OLS          Adj. R-squared:        0.027


  Method:              Least Squares     F-statistic:           2.969


  Date:              Fri, 08 Jul 2016    Prob (F-statistic):   0.0329 


  Time:                  16:04:49        Log-Likelihood:       883.79


  No. Observations:          210         AIC:                  -1760.


  Df Residuals:              206         BIC:                  -1746.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 9.7562      3.485      2.800   0.006      2.886    16.626


  avgReadCpGs_mean     -0.0023      0.005     -0.480   0.632     -0.012     0.007


  Unique_CpGs_mean   8.497e-09   3.29e-09      2.586   0.010   2.02e-09   1.5e-08


  bsRate_mean          -9.9427      3.551     -2.800   0.006    -16.944    -2.941




  Omnibus:        11.188    Durbin-Watson:         2.072


  Prob(Omnibus):   0.004    Jarque-Bera (JB):     11.918


  Skew:            0.564    Prob(JB):            0.00258


  Kurtosis:        2.700    Cond. No.           1.72e+10



In [14]:

    
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[14]:





<matplotlib.text.Text at 0x10e4a69e8>



In [15]:

    
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[15]:





<matplotlib.text.Text at 0x112e75ef0>



In [16]:

    
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[16]:





<matplotlib.text.Text at 0x113a400f0>



In [17]:

    
tritopool2 = merged[merged["protocol"] == 'trito_pool_2']
tritopool2 = tritopool2.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
tritopool2A = tritopool2.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool2.filename,2))
out = pd.DataFrame([tritopool2A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool2.PDR, tritopool2.PDR)), tritopool2.filename, tritopool2.filename)
stacked = df_ex.stack()
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1a = pd.merge(out, pdr_differences, how='inner')
print(pairs1a.shape)
pairs1a = pairs1a.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs1a.PDR_difference # dependent variable
X = pairs1a.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool_2', predict \delta PDR")
est.summary()









    



(253, 7)
(253,)
(253, 3)
Regression results for CLL 'RRBS_trito_pool_2', predict \delta PDR






    Out[17]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.080


  Model:                    OLS          Adj. R-squared:        0.069


  Method:              Least Squares     F-statistic:           7.207


  Date:              Fri, 08 Jul 2016    Prob (F-statistic):  0.000117


  Time:                  16:04:52        Log-Likelihood:       1014.4


  No. Observations:          253         AIC:                  -2021.


  Df Residuals:              249         BIC:                  -2007.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                11.8178      2.713      4.356   0.000      6.474    17.162


  avgReadCpGs_mean      0.0174      0.005      3.204   0.002      0.007     0.028


  Unique_CpGs_mean   1.039e-09   2.25e-09      0.461   0.645   -3.4e-09  5.48e-09


  bsRate_mean         -12.2030      2.794     -4.368   0.000    -17.705    -6.701




  Omnibus:        10.799    Durbin-Watson:         2.251


  Prob(Omnibus):   0.005    Jarque-Bera (JB):     11.510


  Skew:            0.513    Prob(JB):            0.00317


  Kurtosis:        2.797    Cond. No.           1.24e+10



In [18]:

    
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs1a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[18]:





<matplotlib.text.Text at 0x113bccf60>



In [19]:

    
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs1a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[19]:





<matplotlib.text.Text at 0x113d514e0>



In [20]:

    
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs1a, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[20]:





<matplotlib.text.Text at 0x113f8a588>



In [21]:

    
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease']
cw154 = cw154.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.PDR, cw154.PDR)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2 = pd.merge(out, pdr_differences, how='inner')
pairs2 = pairs2.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs2.PDR_difference # dependent variable
X = pairs2.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_Tris_protease', predict \delta PDR")
est.summary()









    



Regression results for CLL 'cw154_Tris_protease', predict \delta PDR






    Out[21]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.024


  Model:                    OLS          Adj. R-squared:        0.010


  Method:              Least Squares     F-statistic:           1.693


  Date:              Fri, 08 Jul 2016    Prob (F-statistic):    0.170 


  Time:                  16:04:55        Log-Likelihood:       866.30


  No. Observations:          210         AIC:                  -1725.


  Df Residuals:              206         BIC:                  -1711.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                -3.8510      1.770     -2.175   0.031     -7.342    -0.360


  avgReadCpGs_mean     -0.0057      0.004     -1.604   0.110     -0.013     0.001


  Unique_CpGs_mean  -2.353e-09   2.53e-09     -0.931   0.353  -7.33e-09  2.63e-09


  bsRate_mean           4.0470      1.853      2.184   0.030      0.394     7.700




  Omnibus:        14.408    Durbin-Watson:         1.946


  Prob(Omnibus):   0.001    Jarque-Bera (JB):     11.824


  Skew:            0.491    Prob(JB):            0.00271


  Kurtosis:        2.378    Cond. No.           3.92e+09



In [22]:

    
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs2, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, cw154_Tris_protease")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[22]:





<matplotlib.text.Text at 0x1141c04a8>



In [23]:

    
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs2, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154_Tris_protease")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[23]:





<matplotlib.text.Text at 0x1143efd30>



In [24]:

    
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs2, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, cw154_Tris_protease")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[24]:





<matplotlib.text.Text at 0x114560208>



In [25]:

    
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease_GR']
cw154 = cw154.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.PDR, cw154.PDR)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2a = pd.merge(out, pdr_differences, how='inner')
pairs2a = pairs2a.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs2a.PDR_difference # dependent variable
X = pairs2a.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_cw154_Tris_protease_GR', predict \delta PDR")
est.summary()









    



Regression results for CLL 'RRBS_cw154_Tris_protease_GR', predict \delta PDR






    Out[25]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.095


  Model:                    OLS          Adj. R-squared:        0.082


  Method:              Least Squares     F-statistic:           7.188


  Date:              Fri, 08 Jul 2016    Prob (F-statistic):  0.000130


  Time:                  16:04:57        Log-Likelihood:       830.25


  No. Observations:          210         AIC:                  -1652.


  Df Residuals:              206         BIC:                  -1639.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                -7.8946      2.163     -3.650   0.000    -12.159    -3.630


  avgReadCpGs_mean      0.0012      0.003      0.337   0.737     -0.006     0.008


  Unique_CpGs_mean  -4.238e-09   2.46e-09     -1.720   0.087   -9.1e-09   6.2e-10


  bsRate_mean           8.2180      2.261      3.634   0.000      3.760    12.676




  Omnibus:        17.100    Durbin-Watson:         1.711


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     18.953


  Skew:            0.723    Prob(JB):           7.66e-05


  Kurtosis:        3.269    Cond. No.           5.05e+09



In [26]:

    
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs2a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, cw154_Tris_protease_GR")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[26]:





<matplotlib.text.Text at 0x11485b240>



In [27]:

    
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs2a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154_Tris_protease_GR")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[27]:





<matplotlib.text.Text at 0x114ac3208>



In [28]:

    
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs2a, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, cw154_Tris_protease_GR")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[28]:





<matplotlib.text.Text at 0x114d24fd0>



In [29]:

    
# cw154_CutSmart_proteinase_K
cw154 = merged[merged["protocol"] == 'cw154_CutSmart_proteinase_K']
cw154 = cw154.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.PDR, cw154.PDR)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2b = pd.merge(out, pdr_differences, how='inner')
pairs2b = pairs2b.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs2b.PDR_difference # dependent variable
X = pairs2b.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta PDR")
est.summary()









    



Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta PDR






    Out[29]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.782


  Model:                    OLS          Adj. R-squared:        0.779


  Method:              Least Squares     F-statistic:           246.9


  Date:              Fri, 08 Jul 2016    Prob (F-statistic):  6.10e-68


  Time:                  16:05:01        Log-Likelihood:       705.34


  No. Observations:          210         AIC:                  -1403.


  Df Residuals:              206         BIC:                  -1389.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const               -56.1216      3.131    -17.925   0.000    -62.294   -49.949


  avgReadCpGs_mean     -0.0473      0.013     -3.596   0.000     -0.073    -0.021


  Unique_CpGs_mean   -5.93e-08   4.97e-09    -11.920   0.000  -6.91e-08 -4.95e-08


  bsRate_mean          58.5189      3.247     18.023   0.000     52.118    64.920




  Omnibus:         4.936    Durbin-Watson:         1.473


  Prob(Omnibus):   0.085    Jarque-Bera (JB):      4.737


  Skew:           -0.278    Prob(JB):             0.0936


  Kurtosis:        3.481    Cond. No.           5.01e+09



In [30]:

    
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs2b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, cw154_Tris_protease_GR")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[30]:





<matplotlib.text.Text at 0x114fae5c0>



In [31]:

    
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs2b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154_Tris_protease_GR")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[31]:





<matplotlib.text.Text at 0x115244fd0>



In [32]:

    
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs2b, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, cw154_Tris_protease_GR")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[32]:





<matplotlib.text.Text at 0x115595dd8>



In [33]:

    
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell1_22_']
pcell = pcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR, pcell.PDR)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3 = pd.merge(out, pdr_differences, how='inner')
pairs3 = pairs3.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs3.PDR_difference # dependent variable
X = pairs3.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal B 'NormalBCD19pCD27pcell1_22', predict \delta PDR")
est.summary()









    



Regression results for Normal B 'NormalBCD19pCD27pcell1_22', predict \delta PDR






    Out[33]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.003


  Model:                    OLS          Adj. R-squared:       -0.017


  Method:              Least Squares     F-statistic:          0.1431


  Date:              Fri, 08 Jul 2016    Prob (F-statistic):    0.934 


  Time:                  16:05:04        Log-Likelihood:       270.53


  No. Observations:          153         AIC:                  -533.1


  Df Residuals:              149         BIC:                  -520.9


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 2.9291     59.000      0.050   0.960   -113.656   119.514


  avgReadCpGs_mean     -0.0181      0.056     -0.324   0.746     -0.129     0.092


  Unique_CpGs_mean   1.659e-08   4.42e-08      0.375   0.708  -7.08e-08  1.04e-07


  bsRate_mean          -2.7874     59.142     -0.047   0.962   -119.652   114.077




  Omnibus:        12.972    Durbin-Watson:         1.877


  Prob(Omnibus):   0.002    Jarque-Bera (JB):     14.614


  Skew:            0.754    Prob(JB):           0.000671


  Kurtosis:        2.866    Cond. No.           9.28e+09



In [34]:

    
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs3, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, CD27pcell1_22")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[34]:





<matplotlib.text.Text at 0x11584e978>



In [35]:

    
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs3, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, CD27pcell1_22")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[35]:





<matplotlib.text.Text at 0x115a89c88>



In [36]:

    
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs3, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, CD27pcell1_22")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[36]:





<matplotlib.text.Text at 0x115d7c3c8>



In [37]:

    
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell23_44']
pcell = pcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR, pcell.PDR)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3a = pd.merge(out, pdr_differences, how='inner')
pairs3a = pairs3a.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs3a.PDR_difference # dependent variable
X = pairs3a.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal B 'NormalBCD19pCD27pcell22_34', predict \delta PDR")
est.summary()









    



Regression results for Normal B 'NormalBCD19pCD27pcell22_34', predict \delta PDR






    Out[37]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.023


  Model:                    OLS          Adj. R-squared:        0.011


  Method:              Least Squares     F-statistic:           1.818


  Date:              Fri, 08 Jul 2016    Prob (F-statistic):    0.145 


  Time:                  16:05:07        Log-Likelihood:       510.68


  No. Observations:          231         AIC:                  -1013.


  Df Residuals:              227         BIC:                  -999.6


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const               -38.5653     44.404     -0.869   0.386   -126.062    48.931


  avgReadCpGs_mean      0.0472      0.023      2.082   0.038      0.003     0.092


  Unique_CpGs_mean   1.946e-08    2.7e-08      0.721   0.471  -3.37e-08  7.26e-08


  bsRate_mean          38.4297     44.487      0.864   0.389    -49.231   126.090




  Omnibus:        36.315    Durbin-Watson:         1.460


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     48.149


  Skew:            1.070    Prob(JB):           3.50e-11


  Kurtosis:        3.654    Cond. No.           1.28e+10



In [38]:

    
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs3a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, CD27pcell23_44")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[38]:





<matplotlib.text.Text at 0x115fd8128>



In [39]:

    
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs3a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  CD27pcell23_44")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[39]:





<matplotlib.text.Text at 0x116253c88>



In [40]:

    
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs3a, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, CD27pcell23_44")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[40]:





<matplotlib.text.Text at 0x116549f28>



In [41]:

    
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell45_66']
pcell = pcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR, pcell.PDR)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3b = pd.merge(out, pdr_differences, how='inner')
pairs3b = pairs3b.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs3b.PDR_difference # dependent variable
X = pairs3b.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal B 'NormalBCD19pCD27pcell45_66', predict \delta PDR")
est.summary()









    



Regression results for Normal B 'NormalBCD19pCD27pcell45_66', predict \delta PDR






    Out[41]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.206


  Model:                    OLS          Adj. R-squared:        0.131


  Method:              Least Squares     F-statistic:           2.760


  Date:              Fri, 08 Jul 2016    Prob (F-statistic):   0.0582 


  Time:                  16:05:10        Log-Likelihood:       76.464


  No. Observations:           36         AIC:                  -144.9


  Df Residuals:               32         BIC:                  -138.6


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const              -208.2096    218.856     -0.951   0.349   -654.005   237.586


  avgReadCpGs_mean      0.1391      0.075      1.858   0.072     -0.013     0.292


  Unique_CpGs_mean  -1.583e-08    3.1e-08     -0.511   0.613   -7.9e-08  4.73e-08


  bsRate_mean         208.0260    219.094      0.949   0.349   -238.253   654.305




  Omnibus:         1.511    Durbin-Watson:         1.762


  Prob(Omnibus):   0.470    Jarque-Bera (JB):      1.412


  Skew:            0.374    Prob(JB):              0.494


  Kurtosis:        2.383    Cond. No.           3.46e+10



In [42]:

    
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs3b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, NormalBCD19pCD27pcell45_66")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[42]:





<matplotlib.text.Text at 0x1167ab080>



In [43]:

    
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs3b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, NormalBCD19pCD27pcell45_66")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[43]:





<matplotlib.text.Text at 0x1169a4128>



In [44]:

    
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs3b, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, NormalBCD19pCD27pcell45_66")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[44]:





<matplotlib.text.Text at 0x116bd2898>



In [45]:

    
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell67_88']
pcell = pcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR, pcell.PDR)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3c = pd.merge(out, pdr_differences, how='inner')
pairs3c = pairs3c.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs3c.PDR_difference # dependent variable
X = pairs3c.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal B 'NormalBCD19pCD27pcell67_88', predict \delta PDR")
est.summary()









    



Regression results for Normal B 'NormalBCD19pCD27pcell67_88', predict \delta PDR






    Out[45]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.028


  Model:                    OLS          Adj. R-squared:        0.013


  Method:              Least Squares     F-statistic:           1.944


  Date:              Fri, 08 Jul 2016    Prob (F-statistic):    0.124 


  Time:                  16:05:13        Log-Likelihood:       449.27


  No. Observations:          210         AIC:                  -890.5


  Df Residuals:              206         BIC:                  -877.1


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                11.9645     42.917      0.279   0.781    -72.649    96.578


  avgReadCpGs_mean     -0.0782      0.045     -1.731   0.085     -0.167     0.011


  Unique_CpGs_mean   2.979e-08   2.52e-08      1.182   0.238  -1.99e-08  7.95e-08


  bsRate_mean         -11.5555     42.944     -0.269   0.788    -96.222    73.111




  Omnibus:        11.646    Durbin-Watson:         1.605


  Prob(Omnibus):   0.003    Jarque-Bera (JB):     12.653


  Skew:            0.595    Prob(JB):            0.00179


  Kurtosis:        2.833    Cond. No.           1.06e+10



In [46]:

    
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs3c, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, NormalBCD19pCD27pcell67_88")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[46]:





<matplotlib.text.Text at 0x116ed4320>



In [47]:

    
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs3c, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, NormalBCD19pCD27pcell67_88")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[47]:





<matplotlib.text.Text at 0x117141c18>



In [48]:

    
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs3c, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, NormalBCD19pCD27pcell67_88")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[48]:





<matplotlib.text.Text at 0x1174f5f60>



In [49]:

    
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell1_22_']
mcell = mcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR, mcell.PDR)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4 = pd.merge(out, pdr_differences, how='inner')
pairs4 = pairs4.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs4.PDR_difference # dependent variable
X = pairs4.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'NormalBCD19pCD27mcell1_22', predict \delta PDR")
est.summary()









    



Regression results for 'NormalBCD19pCD27mcell1_22', predict \delta PDR






    Out[49]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.204


  Model:                    OLS          Adj. R-squared:        0.188


  Method:              Least Squares     F-statistic:           12.75


  Date:              Fri, 08 Jul 2016    Prob (F-statistic):  1.84e-07


  Time:                  16:05:15        Log-Likelihood:       419.11


  No. Observations:          153         AIC:                  -830.2


  Df Residuals:              149         BIC:                  -818.1


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const               214.0898     37.670      5.683   0.000    139.653   288.527


  avgReadCpGs_mean      0.0589      0.023      2.584   0.011      0.014     0.104


  Unique_CpGs_mean  -2.967e-08   2.13e-08     -1.393   0.166  -7.17e-08  1.24e-08


  bsRate_mean        -214.9677     37.812     -5.685   0.000   -289.684  -140.251




  Omnibus:        50.169    Durbin-Watson:         1.669


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     91.142


  Skew:            1.594    Prob(JB):           1.62e-20


  Kurtosis:        5.034    Cond. No.           1.65e+10



In [50]:

    
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs4, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, NormalBCD19pCD27mcell1_22 ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[50]:





<matplotlib.text.Text at 0x1177754a8>



In [51]:

    
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs4, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, NormalBCD19pCD27mcell1_22")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[51]:





<matplotlib.text.Text at 0x1179e7898>



In [52]:

    
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs4, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, NormalBCD19pCD27mcell1_22")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[52]:





<matplotlib.text.Text at 0x117d41160>



In [53]:

    
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell23_44']
mcell = mcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR, mcell.PDR)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4a = pd.merge(out, pdr_differences, how='inner')
pairs4a = pairs4a.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs4a.PDR_difference # dependent variable
X = pairs4a.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'NormalBCD19pCD27mcell23_44', predict \delta PDR")
est.summary()









    



Regression results for 'NormalBCD19pCD27mcell23_44', predict \delta PDR






    Out[53]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.121


  Model:                    OLS          Adj. R-squared:        0.105


  Method:              Least Squares     F-statistic:           7.652


  Date:              Fri, 08 Jul 2016    Prob (F-statistic):  8.00e-05


  Time:                  16:05:19        Log-Likelihood:       414.05


  No. Observations:          171         AIC:                  -820.1


  Df Residuals:              167         BIC:                  -807.5


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const               -22.6677     63.828     -0.355   0.723   -148.682   103.347


  avgReadCpGs_mean     -0.0650      0.046     -1.424   0.156     -0.155     0.025


  Unique_CpGs_mean   7.448e-08   3.01e-08      2.473   0.014    1.5e-08  1.34e-07


  bsRate_mean          23.0609     63.990      0.360   0.719   -103.274   149.395




  Omnibus:        81.382    Durbin-Watson:         0.209


  Prob(Omnibus):   0.000    Jarque-Bera (JB):    217.458


  Skew:            2.118    Prob(JB):           6.02e-48


  Kurtosis:        6.547    Cond. No.           1.91e+10



In [54]:

    
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs4a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, NormalBCD19pCD27mcell23_44 ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[54]:





<matplotlib.text.Text at 0x118000080>



In [55]:

    
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs4a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, NormalBCD19pCD27mcell23_44")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[55]:





<matplotlib.text.Text at 0x1182dbbe0>



In [56]:

    
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs4a, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, NormalBCD19pCD27mcell23_44")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[56]:





<matplotlib.text.Text at 0x1185c4cf8>



In [57]:

    
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell45_66']
mcell = mcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR, mcell.PDR)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4b = pd.merge(out, pdr_differences, how='inner')
pairs4b = pairs4b.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs4b.PDR_difference # dependent variable
X = pairs4b.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'NormalBCD19pCD27mcell23_44', predict \delta PDR")
est.summary()









    



Regression results for 'NormalBCD19pCD27mcell23_44', predict \delta PDR






    Out[57]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.121


  Model:                    OLS          Adj. R-squared:        0.101


  Method:              Least Squares     F-statistic:           6.070


  Date:              Fri, 08 Jul 2016    Prob (F-statistic):  0.000667


  Time:                  16:05:22        Log-Likelihood:       603.73


  No. Observations:          136         AIC:                  -1199.


  Df Residuals:              132         BIC:                  -1188.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                 9.0981      7.462      1.219   0.225     -5.663    23.859


  avgReadCpGs_mean      0.0177      0.004      3.966   0.000      0.009     0.026


  Unique_CpGs_mean   1.624e-08   4.77e-09      3.402   0.001    6.8e-09  2.57e-08


  bsRate_mean          -9.2184      7.481     -1.232   0.220    -24.017     5.580




  Omnibus:        15.593    Durbin-Watson:         1.264


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     17.224


  Skew:            0.829    Prob(JB):           0.000182


  Kurtosis:        3.537    Cond. No.           1.55e+10



In [58]:

    
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs4b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, NormalBCD19pCD27mcell45_66 ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[58]:





<matplotlib.text.Text at 0x118987400>



In [59]:

    
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs4b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, NormalBCD19pCD27mcell45_66")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[59]:





<matplotlib.text.Text at 0x118b89320>



In [60]:

    
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs4b, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, NormalBCD19pCD27mcell45_66")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[60]:





<matplotlib.text.Text at 0x119064fd0>



In [61]:

    
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell67_88']
mcell = mcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR, mcell.PDR)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4c = pd.merge(out, pdr_differences, how='inner')
pairs4c = pairs4c.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs4c.PDR_difference # dependent variable
X = pairs4c.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'NormalBCD19pCD27mcell23_44', predict \delta PDR")
est.summary()









    



Regression results for 'NormalBCD19pCD27mcell23_44', predict \delta PDR






    Out[61]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.102


  Model:                    OLS          Adj. R-squared:        0.088


  Method:              Least Squares     F-statistic:           7.763


  Date:              Fri, 08 Jul 2016    Prob (F-statistic):  6.17e-05


  Time:                  16:05:25        Log-Likelihood:       569.91


  No. Observations:          210         AIC:                  -1132.


  Df Residuals:              206         BIC:                  -1118.


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                50.6559     37.723      1.343   0.181    -23.717   125.029


  avgReadCpGs_mean      0.0257      0.018      1.468   0.144     -0.009     0.060


  Unique_CpGs_mean   6.067e-08   1.67e-08      3.631   0.000   2.77e-08  9.36e-08


  bsRate_mean         -50.9406     37.824     -1.347   0.180   -125.513    23.632




  Omnibus:        96.596    Durbin-Watson:         0.945


  Prob(Omnibus):   0.000    Jarque-Bera (JB):    274.211


  Skew:            2.082    Prob(JB):           2.86e-60


  Kurtosis:        6.742    Cond. No.           2.04e+10



In [62]:

    
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs4c, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, NormalBCD19pCD27mcell67_88 ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[62]:





<matplotlib.text.Text at 0x1192d25f8>



In [63]:

    
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs4c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, NormalBCD19pCD27mcell67_88")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[63]:





<matplotlib.text.Text at 0x119c0fdd8>



In [64]:

    
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs4c, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, NormalBCD19pCD27mcell67_88")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[64]:





<matplotlib.text.Text at 0x119ed5780>



In [65]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_A1_24']
normb = normb.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR, normb.PDR)), normb.filename, normb.filename)
stacked = df_ex.stack()
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5 = pd.merge(out, pdr_differences, how='inner')
pairs5 = pairs5.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs5.PDR_difference # dependent variable
X = pairs5.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'normal_B_cell_A1_24', predict \delta PDR")
est.summary()









    



Regression results for 'normal_B_cell_A1_24', predict \delta PDR






    Out[65]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.004


  Model:                    OLS          Adj. R-squared:       -0.012


  Method:              Least Squares     F-statistic:          0.2577


  Date:              Fri, 08 Jul 2016    Prob (F-statistic):    0.856 


  Time:                  16:05:28        Log-Likelihood:       265.87


  No. Observations:          190         AIC:                  -523.7


  Df Residuals:              186         BIC:                  -510.8


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                -9.3400     16.075     -0.581   0.562    -41.053    22.373


  avgReadCpGs_mean     -0.0341      0.085     -0.399   0.690     -0.202     0.134


  Unique_CpGs_mean    7.15e-08   1.25e-07      0.573   0.567  -1.75e-07  3.18e-07


  bsRate_mean          10.0033     16.813      0.595   0.553    -23.165    43.172




  Omnibus:        66.531    Durbin-Watson:         1.997


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     13.180


  Skew:            0.299    Prob(JB):            0.00137


  Kurtosis:        1.856    Cond. No.           1.17e+09



In [66]:

    
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs5, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, normal_B_cell_A1_24")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[66]:





<matplotlib.text.Text at 0x11a1b87b8>



In [67]:

    
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs5, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, normal_B_cell_A1_24")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[67]:





<matplotlib.text.Text at 0x11a4176d8>



In [68]:

    
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs5, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, normal_B_cell_A1_24")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[68]:





<matplotlib.text.Text at 0x11a71c198>



In [69]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_B1_24']
normb = normb.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR, normb.PDR)), normb.filename, normb.filename)
stacked = df_ex.stack()
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5a = pd.merge(out, pdr_differences, how='inner')
pairs5a = pairs5a.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs5a.PDR_difference # dependent variable
X = pairs5a.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'normal_B_cell_B1_24', predict \delta PDR")
est.summary()









    



Regression results for 'normal_B_cell_B1_24', predict \delta PDR






    Out[69]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.005


  Model:                    OLS          Adj. R-squared:       -0.011


  Method:              Least Squares     F-statistic:          0.3304


  Date:              Fri, 08 Jul 2016    Prob (F-statistic):    0.803 


  Time:                  16:05:32        Log-Likelihood:       240.99


  No. Observations:          190         AIC:                  -474.0


  Df Residuals:              186         BIC:                  -461.0


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const               -32.0771     43.659     -0.735   0.463   -118.207    54.053


  avgReadCpGs_mean     -0.0779      0.081     -0.962   0.337     -0.238     0.082


  Unique_CpGs_mean  -6.424e-09   4.57e-08     -0.141   0.888  -9.66e-08  8.37e-08


  bsRate_mean          33.9108     45.716      0.742   0.459    -56.278   124.100




  Omnibus:        657.703    Durbin-Watson:         2.035


  Prob(Omnibus):   0.000     Jarque-Bera (JB):     16.626


  Skew:            0.099     Prob(JB):           0.000245


  Kurtosis:        1.564     Cond. No.           5.83e+09



In [70]:

    
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs5a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, normal_B_cell_B1_24")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[70]:





<matplotlib.text.Text at 0x11a92b7b8>



In [71]:

    
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs5a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, normal_B_cell_B1_24")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[71]:





<matplotlib.text.Text at 0x11ab86dd8>



In [72]:

    
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs5a, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, normal_B_cell_B1_24")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[72]:





<matplotlib.text.Text at 0x11ae9a438>



In [73]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_C1_24']
normb = normb.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR, normb.PDR)), normb.filename, normb.filename)
stacked = df_ex.stack()
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5b = pd.merge(out, pdr_differences, how='inner')
pairs5b = pairs5b.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs5b.PDR_difference # dependent variable
X = pairs5b.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'normal_B_cell_C1_24', predict \delta PDR")
est.summary()









    



Regression results for 'normal_B_cell_C1_24', predict \delta PDR






    Out[73]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.004


  Model:                    OLS          Adj. R-squared:       -0.009


  Method:              Least Squares     F-statistic:          0.3055


  Date:              Fri, 08 Jul 2016    Prob (F-statistic):    0.821 


  Time:                  16:05:35        Log-Likelihood:       321.30


  No. Observations:          231         AIC:                  -634.6


  Df Residuals:              227         BIC:                  -620.8


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                28.3954     33.024      0.860   0.391    -36.677    93.467


  avgReadCpGs_mean      0.0196      0.090      0.217   0.828     -0.158     0.197


  Unique_CpGs_mean   4.244e-09   1.98e-08      0.214   0.831  -3.48e-08  4.33e-08


  bsRate_mean         -29.5285     34.613     -0.853   0.395    -97.732    38.675




  Omnibus:        105.670    Durbin-Watson:         1.095


  Prob(Omnibus):   0.000     Jarque-Bera (JB):     15.595


  Skew:            0.233     Prob(JB):           0.000411


  Kurtosis:        1.815     Cond. No.           9.25e+09



In [74]:

    
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs5b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, normal_B_cell_C1_24")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[74]:





<matplotlib.text.Text at 0x11b1261d0>



In [75]:

    
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs5b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, normal_B_cell_C1_24")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[75]:





<matplotlib.text.Text at 0x11b426048>



In [76]:

    
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs5b, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, normal_B_cell_C1_24")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[76]:





<matplotlib.text.Text at 0x11b66d748>



In [77]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_D1_24']
normb = normb.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR, normb.PDR)), normb.filename, normb.filename)
stacked = df_ex.stack()
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5c = pd.merge(out, pdr_differences, how='inner')
pairs5c = pairs5c.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs5c.PDR_difference # dependent variable
X = pairs5c.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'normal_B_cell_D1_24', predict \delta PDR")
est.summary()









    



Regression results for 'normal_B_cell_D1_24', predict \delta PDR






    Out[77]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.051


  Model:                    OLS          Adj. R-squared:       -0.038


  Method:              Least Squares     F-statistic:          0.5707


  Date:              Fri, 08 Jul 2016    Prob (F-statistic):    0.638 


  Time:                  16:05:37        Log-Likelihood:       57.162


  No. Observations:           36         AIC:                  -106.3


  Df Residuals:               32         BIC:                  -99.99


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const               -28.1188     59.801     -0.470   0.641   -149.930    93.693


  avgReadCpGs_mean     -0.1341      0.129     -1.042   0.305     -0.396     0.128


  Unique_CpGs_mean   6.644e-08      6e-08      1.108   0.276  -5.57e-08  1.89e-07


  bsRate_mean          30.0337     62.279      0.482   0.633    -96.825   156.892




  Omnibus:        13.704    Durbin-Watson:         1.597


  Prob(Omnibus):   0.001    Jarque-Bera (JB):      2.866


  Skew:           -0.078    Prob(JB):              0.239


  Kurtosis:        1.627    Cond. No.           7.43e+09



In [78]:

    
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs5c, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, normal_B_cell_D1_24 ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[78]:





<matplotlib.text.Text at 0x11b98feb8>



In [79]:

    
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs5c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, normal_B_cell_D1_24")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[79]:





<matplotlib.text.Text at 0x11bc76400>



In [80]:

    
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs5c, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, normal_B_cell_D1_24")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[80]:





<matplotlib.text.Text at 0x11beaf400>



In [81]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_G1_22']
normb = normb.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR, normb.PDR)), normb.filename, normb.filename)
stacked = df_ex.stack()
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5d = pd.merge(out, pdr_differences, how='inner')
pairs5d = pairs5d.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs5d.PDR_difference # dependent variable
X = pairs5d.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'normal_B_cell_G1_22', predict \delta PDR")
est.summary()









    



Regression results for 'normal_B_cell_G1_22', predict \delta PDR






    Out[81]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.061


  Model:                    OLS          Adj. R-squared:        0.044


  Method:              Least Squares     F-statistic:           3.597


  Date:              Fri, 08 Jul 2016    Prob (F-statistic):   0.0148 


  Time:                  16:05:40        Log-Likelihood:       249.80


  No. Observations:          171         AIC:                  -491.6


  Df Residuals:              167         BIC:                  -479.0


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const               -39.0050     26.517     -1.471   0.143    -91.356    13.346


  avgReadCpGs_mean      0.1447      0.097      1.486   0.139     -0.048     0.337


  Unique_CpGs_mean  -1.569e-08   4.09e-08     -0.384   0.701  -9.63e-08   6.5e-08


  bsRate_mean          39.8947     27.921      1.429   0.155    -15.229    95.018




  Omnibus:        55.378    Durbin-Watson:         1.985


  Prob(Omnibus):   0.000    Jarque-Bera (JB):     12.278


  Skew:            0.332    Prob(JB):            0.00216


  Kurtosis:        1.867    Cond. No.           5.53e+09



In [82]:

    
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs5d, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, normal_B_cell_G1_22 ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[82]:





<matplotlib.text.Text at 0x11c1a1f28>



In [83]:

    
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs5d, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, normal_B_cell_G1_22")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[83]:





<matplotlib.text.Text at 0x11c4b7a20>



In [84]:

    
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs5d, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, normal_B_cell_G1_22")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[84]:





<matplotlib.text.Text at 0x11c6b6198>



In [85]:

    
normb = merged[merged["protocol"] == 'normal_B_cell_H1_22']
normb = normb.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR, normb.PDR)), normb.filename, normb.filename)
stacked = df_ex.stack()
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5e = pd.merge(out, pdr_differences, how='inner')
pairs5e = pairs5e.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs5e.PDR_difference # dependent variable
X = pairs5e.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'normal_B_cell_H1_22', predict \delta PDR")
est.summary()









    



Regression results for 'normal_B_cell_H1_22', predict \delta PDR






    Out[85]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.026


  Model:                    OLS          Adj. R-squared:        0.001


  Method:              Least Squares     F-statistic:           1.035


  Date:              Fri, 08 Jul 2016    Prob (F-statistic):    0.380 


  Time:                  16:05:43        Log-Likelihood:       176.43


  No. Observations:          120         AIC:                  -344.9


  Df Residuals:              116         BIC:                  -333.7


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const               -24.7243     44.228     -0.559   0.577   -112.323    62.874


  avgReadCpGs_mean     -0.1215      0.102     -1.194   0.235     -0.323     0.080


  Unique_CpGs_mean   8.854e-08   6.02e-08      1.471   0.144  -3.07e-08  2.08e-07


  bsRate_mean          26.4297     46.163      0.573   0.568    -65.003   117.862




  Omnibus:        15.833    Durbin-Watson:         1.279


  Prob(Omnibus):   0.000    Jarque-Bera (JB):      9.895


  Skew:            0.556    Prob(JB):            0.00710


  Kurtosis:        2.139    Cond. No.           6.56e+09



In [86]:

    
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs5e, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, normal_B_cell_H1_22")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[86]:





<matplotlib.text.Text at 0x11c9ba780>



In [87]:

    
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=pairs5e, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, normal_B_cell_H1_22")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[87]:





<matplotlib.text.Text at 0x11cc01c18>



In [88]:

    
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=pairs5e, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, normal_B_cell_H1_22")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[88]:





<matplotlib.text.Text at 0x11ce4a4a8>



In [89]:

    
pairs['type'] = str('CLL')
pairs1a['type'] = str('CLL')
pairs2['type'] = str('CLL')
pairs2a['type'] = str('CLL')
pairs2b['type'] = str('CLL')
pairs3['type'] = str('CLL')
pairs3a['type'] = str('normal')
pairs3b['type'] = str('normal')
pairs3c['type'] = str('normal')
pairs4['type'] = str('normal')
pairs4a['type'] = str('normal')
pairs4b['type'] = str('normal')
pairs4c['type'] = str('normal')
pairs5['type'] = str('normal')
pairs5a['type'] = str('normal')
pairs5b['type'] = str('normal')
pairs5c['type'] = str('normal')
pairs5d['type'] = str('normal')
pairs5e['type'] = str('normal')

frames22 = [pairs, pairs2, pairs3, pairs4]
total_pairs = pd.concat(frames22)
y = total_pairs.PDR_difference # dependent variable
X = total_pairs.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)

categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)
    
X = X.drop(['type_normal'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR")
est.summary()









    



Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR






    Out[89]:





OLS Regression Results

  Dep. Variable:      PDR_difference     R-squared:             0.480 


  Model:                    OLS          Adj. R-squared:        0.477 


  Method:              Least Squares     F-statistic:           166.5 


  Date:              Fri, 08 Jul 2016    Prob (F-statistic):  6.43e-101


  Time:                  16:05:46        Log-Likelihood:       1760.1 


  No. Observations:          726         AIC:                  -3510. 


  Df Residuals:              721         BIC:                  -3487. 


  Df Model:                    4                                      


  Covariance Type:       nonrobust                                    




                      coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  const                -1.1603      0.112    -10.343   0.000     -1.381    -0.940


  avgReadCpGs_mean     -0.0203      0.010     -2.053   0.040     -0.040    -0.001


  Unique_CpGs_mean  -4.763e-08    3.5e-09    -13.593   0.000  -5.45e-08 -4.07e-08


  bsRate_mean           1.3032      0.074     17.635   0.000      1.158     1.448


  type_CLL              0.0426      0.003     16.017   0.000      0.037     0.048




  Omnibus:        284.893    Durbin-Watson:         1.829


  Prob(Omnibus):   0.000     Jarque-Bera (JB):   1749.731


  Skew:            1.644     Prob(JB):               0.00


  Kurtosis:        9.858     Cond. No.           9.50e+07



In [90]:

    
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, both CLL and normal ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[90]:





<matplotlib.text.Text at 0x11d0c8048>



In [91]:

    
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, both CLL and normal ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[91]:





<matplotlib.text.Text at 0x11d38f080>



In [92]:

    
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, both CLL and normal ")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    Out[92]:





<matplotlib.text.Text at 0x11d696390>



In [93]:

    
sns.lmplot(x="bsRate_mean", y="PDR_difference",  data=total_pairs, hue='type')









    Out[93]:





<seaborn.axisgrid.FacetGrid at 0x11d914550>



In [94]:

    
sns.lmplot(x="avgReadCpGs_mean", y="PDR_difference",  data=total_pairs, hue='type')









    Out[94]:





<seaborn.axisgrid.FacetGrid at 0x11da48358>



In [95]:

    
sns.lmplot(x="Unique_CpGs_mean", y="PDR_difference",  data=total_pairs, hue='type')









    Out[95]:





<seaborn.axisgrid.FacetGrid at 0x11db5c710>



In [96]:

    
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting






    



Random Forest model score is 0.902306825835



In [ ]:



In [ ]:

	filename	PDR_unweighted	total_reads	type	bio	protocol	avgReadCpGs_mean	avgReadCpGs_median
0	RRBS_normal_B_cell_A1_24_TAAGGCGA.ACAACC	0.254835	11894660.0	normal	normal_B	normal_B_cell_A1_24	5.295301	4.0
1	RRBS_normal_B_cell_A1_24_TAAGGCGA.ACCGCG	0.390562	3744659.0	normal	normal_B	normal_B_cell_A1_24	5.285714	4.0
2	RRBS_normal_B_cell_A1_24_TAAGGCGA.ACGTGG	0.266418	10461874.0	normal	normal_B	normal_B_cell_A1_24	5.453122	5.0
3	RRBS_normal_B_cell_A1_24_TAAGGCGA.ACTCAC	0.265385	14051.0	normal	normal_B	normal_B_cell_A1_24	4.950166	4.0
4	RRBS_normal_B_cell_A1_24_TAAGGCGA.AGGATG	0.240201	21928743.0	normal	normal_B	normal_B_cell_A1_24	5.366276	5.0

Dep. Variable:	PDR	R-squared:	0.501
Model:	OLS	Adj. R-squared:	0.495
Method:	Least Squares	F-statistic:	88.54
Date:	Fri, 08 Jul 2016	Prob (F-statistic):	4.93e-52
Time:	16:04:48	Log-Likelihood:	502.75
No. Observations:	358	AIC:	-995.5
Df Residuals:	353	BIC:	-976.1
Df Model:	4
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
const	2.2452	0.160	14.058	0.000	1.931 2.559
avgReadCpGs_mean	-0.1098	0.024	-4.547	0.000	-0.157 -0.062
Unique_CpGs	1.57e-08	1.32e-08	1.186	0.236	-1.03e-08 4.17e-08
bsRate	-2.5205	0.202	-12.457	0.000	-2.918 -2.123
type_CLL	1.1512	0.079	14.555	0.000	0.996 1.307
type_normal	1.0939	0.081	13.539	0.000	0.935 1.253

Omnibus:	66.898	Durbin-Watson:	1.066
Prob(Omnibus):	0.000	Jarque-Bera (JB):	16.145
Skew:	0.158	Prob(JB):	0.000312
Kurtosis:	2.009	Cond. No.	6.18e+20

Dep. Variable:	PDR_difference	R-squared:	0.041
Model:	OLS	Adj. R-squared:	0.027
Method:	Least Squares	F-statistic:	2.969
Date:	Fri, 08 Jul 2016	Prob (F-statistic):	0.0329
Time:	16:04:49	Log-Likelihood:	883.79
No. Observations:	210	AIC:	-1760.
Df Residuals:	206	BIC:	-1746.
Df Model:	3
Covariance Type:	nonrobust

Omnibus:	11.188	Durbin-Watson:	2.072
Prob(Omnibus):	0.004	Jarque-Bera (JB):	11.918
Skew:	0.564	Prob(JB):	0.00258
Kurtosis:	2.700	Cond. No.	1.72e+10

Omnibus:	10.799	Durbin-Watson:	2.251
Prob(Omnibus):	0.005	Jarque-Bera (JB):	11.510
Skew:	0.513	Prob(JB):	0.00317
Kurtosis:	2.797	Cond. No.	1.24e+10

Omnibus:	14.408	Durbin-Watson:	1.946
Prob(Omnibus):	0.001	Jarque-Bera (JB):	11.824
Skew:	0.491	Prob(JB):	0.00271
Kurtosis:	2.378	Cond. No.	3.92e+09

Omnibus:	17.100	Durbin-Watson:	1.711
Prob(Omnibus):	0.000	Jarque-Bera (JB):	18.953
Skew:	0.723	Prob(JB):	7.66e-05
Kurtosis:	3.269	Cond. No.	5.05e+09

Omnibus:	4.936	Durbin-Watson:	1.473
Prob(Omnibus):	0.085	Jarque-Bera (JB):	4.737
Skew:	-0.278	Prob(JB):	0.0936
Kurtosis:	3.481	Cond. No.	5.01e+09

Omnibus:	12.972	Durbin-Watson:	1.877
Prob(Omnibus):	0.002	Jarque-Bera (JB):	14.614
Skew:	0.754	Prob(JB):	0.000671
Kurtosis:	2.866	Cond. No.	9.28e+09

Omnibus:	36.315	Durbin-Watson:	1.460
Prob(Omnibus):	0.000	Jarque-Bera (JB):	48.149
Skew:	1.070	Prob(JB):	3.50e-11
Kurtosis:	3.654	Cond. No.	1.28e+10

Omnibus:	1.511	Durbin-Watson:	1.762
Prob(Omnibus):	0.470	Jarque-Bera (JB):	1.412
Skew:	0.374	Prob(JB):	0.494
Kurtosis:	2.383	Cond. No.	3.46e+10

Omnibus:	11.646	Durbin-Watson:	1.605
Prob(Omnibus):	0.003	Jarque-Bera (JB):	12.653
Skew:	0.595	Prob(JB):	0.00179
Kurtosis:	2.833	Cond. No.	1.06e+10

Omnibus:	50.169	Durbin-Watson:	1.669
Prob(Omnibus):	0.000	Jarque-Bera (JB):	91.142
Skew:	1.594	Prob(JB):	1.62e-20
Kurtosis:	5.034	Cond. No.	1.65e+10

Omnibus:	81.382	Durbin-Watson:	0.209
Prob(Omnibus):	0.000	Jarque-Bera (JB):	217.458
Skew:	2.118	Prob(JB):	6.02e-48
Kurtosis:	6.547	Cond. No.	1.91e+10

Omnibus:	15.593	Durbin-Watson:	1.264
Prob(Omnibus):	0.000	Jarque-Bera (JB):	17.224
Skew:	0.829	Prob(JB):	0.000182
Kurtosis:	3.537	Cond. No.	1.55e+10

Omnibus:	96.596	Durbin-Watson:	0.945
Prob(Omnibus):	0.000	Jarque-Bera (JB):	274.211
Skew:	2.082	Prob(JB):	2.86e-60
Kurtosis:	6.742	Cond. No.	2.04e+10

Omnibus:	66.531	Durbin-Watson:	1.997
Prob(Omnibus):	0.000	Jarque-Bera (JB):	13.180
Skew:	0.299	Prob(JB):	0.00137
Kurtosis:	1.856	Cond. No.	1.17e+09

Omnibus:	657.703	Durbin-Watson:	2.035
Prob(Omnibus):	0.000	Jarque-Bera (JB):	16.626
Skew:	0.099	Prob(JB):	0.000245
Kurtosis:	1.564	Cond. No.	5.83e+09

Omnibus:	105.670	Durbin-Watson:	1.095
Prob(Omnibus):	0.000	Jarque-Bera (JB):	15.595
Skew:	0.233	Prob(JB):	0.000411
Kurtosis:	1.815	Cond. No.	9.25e+09

Omnibus:	13.704	Durbin-Watson:	1.597
Prob(Omnibus):	0.001	Jarque-Bera (JB):	2.866
Skew:	-0.078	Prob(JB):	0.239
Kurtosis:	1.627	Cond. No.	7.43e+09

Omnibus:	55.378	Durbin-Watson:	1.985
Prob(Omnibus):	0.000	Jarque-Bera (JB):	12.278
Skew:	0.332	Prob(JB):	0.00216
Kurtosis:	1.867	Cond. No.	5.53e+09

Omnibus:	15.833	Durbin-Watson:	1.279
Prob(Omnibus):	0.000	Jarque-Bera (JB):	9.895
Skew:	0.556	Prob(JB):	0.00710
Kurtosis:	2.139	Cond. No.	6.56e+09

Omnibus:	284.893	Durbin-Watson:	1.829
Prob(Omnibus):	0.000	Jarque-Bera (JB):	1749.731
Skew:	1.644	Prob(JB):	0.00
Kurtosis:	9.858	Cond. No.	9.50e+07