In [1]:
%matplotlib inline

In [2]:
#
# Covariates are: 
#   - Number of unique CpGs per cell
#   - Median Average Read CpG per cell (or mean if normally distributed)
#   - BS rate per cell
#   - CLL or Normal status per cell
#
# For distances (i.e. PDR difference between pairs, methylation difference between pairs), 
# the covariates are the mean between the two pairs. 
#

In [3]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # print all rows


import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')

import statsmodels.api as sm

In [4]:
stats = pd.read_csv("all_RRBS_statistics_final.csv")

In [5]:
stats.shape


Out[5]:
(438, 15)

In [6]:
normal = stats[stats["type"]=="normal"]
CLL = stats[stats["type"]=="CLL"]

In [7]:
len(normal)


Out[7]:
336

In [8]:
len(CLL)


Out[8]:
102

In [9]:
mcell_cpg = pd.read_csv("mcell_avgCpGs.csv")
pcell_cpg = pd.read_csv("pcell_avgCpGs.csv")
CD19cell_cpg = pd.read_csv("CD19_avgCpGs.csv")
normalB_cell_cpg = pd.read_csv("normalB_avgCpGs.csv")
trito_cell_cpg = pd.read_csv("trito_avgCpGs.csv")
cw154_cell_cpg = pd.read_csv("cw154_cpgs.csv")

In [10]:
mcell_cpg = mcell_cpg.drop(["Unnamed: 0"], axis=1)
pcell_cpg = pcell_cpg.drop(["Unnamed: 0"], axis=1)
CD19cell_cpg = CD19cell_cpg.drop(["Unnamed: 0"], axis=1)
normalB_cell_cpg = normalB_cell_cpg.drop(["Unnamed: 0"], axis=1)
trito_cell_cpg = trito_cell_cpg.drop(["Unnamed: 0"], axis=1)
cw154_cell_cpg = cw154_cell_cpg.drop(["Unnamed: 0"], axis=1)

In [11]:
cpg_total = pd.concat([mcell_cpg, pcell_cpg, CD19cell_cpg, normalB_cell_cpg, trito_cell_cpg, cw154_cell_cpg])

In [12]:
cpg_total.shape


Out[12]:
(513, 4)

In [13]:
merged = stats.merge(cpg_total, on="filename")

In [14]:
merged.shape


Out[14]:
(438, 18)

In [15]:
merged.head()


Out[15]:
filename methylation PDR_total methylation_unweighted PDR_unweighted thisMeth mixedReadCount total_reads type bio protocol total_cpg_no_filter total_cpg_gtrthan1 total_cpg_gtrthan38 bsRate avgReadCpgs_nofilter avgReadCpgs_lessthan1CpG avgReadCpgs_gtreql3.8CpG
0 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CATGAC 0.529505 0.235795 0.632802 0.231878 2208325.0 983394.0 4170549.0 normal CD19CD27m NormalBCD19pCD27mcell67_88 525282.0 525251.0 435636.0 0.9975 5.354284 5.355660 7.019255
1 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CCTTCG 0.455550 0.177631 0.583859 0.175371 733064.0 285841.0 1609185.0 normal CD19CD27m NormalBCD19pCD27mcell67_88 221972.0 221962.0 186757.0 0.9975 5.587294 5.588449 7.302612
2 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CGGTAG 0.515269 0.177645 0.618578 0.174221 1452802.0 500870.0 2819500.0 normal CD19CD27m NormalBCD19pCD27mcell67_88 355730.0 355713.0 295624.0 0.9975 5.393199 5.394331 7.079288
3 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTATTG 0.556175 0.176367 0.652727 0.172273 2279354.0 722800.0 4098270.0 normal CD19CD27m NormalBCD19pCD27mcell67_88 483179.0 483150.0 397812.0 0.9975 5.287116 5.288477 6.979525
4 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTCAGC 0.528642 0.181331 0.640401 0.172287 1394208.0 478231.0 2637340.0 normal CD19CD27m NormalBCD19pCD27mcell67_88 356122.0 356100.0 294065.0 0.9975 5.314302 5.315719 6.995052

In [ ]:


In [ ]:


In [16]:
tritopool = merged[merged["protocol"] == 'trito_pool_1']       # select only "trito_pool_1" files
print(len(tritopool))
tritopool = tritopool.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
tritopoolA = tritopool.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool.filename,2)) # combines into all pairs
out = pd.DataFrame([tritopoolA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.PDR_unweighted, tritopool.PDR_unweighted)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1 = pd.merge(out, PDR_differences, how='inner')
print(pairs1.shape)

pairs1 = pairs1.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs1.PDR_difference # dependent variable to predict

X = pairs1.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


21
(210, 14)
(210,)
(210, 3)
Regression results for CLL 'RRBS_trito_pool1', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[16]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.046
Model: OLS Adj. R-squared: 0.033
Method: Least Squares F-statistic: 3.346
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.0201
Time: 21:41:58 Log-Likelihood: 884.34
No. Observations: 210 AIC: -1761.
Df Residuals: 206 BIC: -1747.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 8.9491 3.463 2.584 0.010 2.121 15.777
total_cpg_no_filter 4.547e-09 3.87e-09 1.175 0.241 -3.08e-09 1.22e-08
bsRate_mean -9.2084 3.533 -2.606 0.010 -16.175 -2.242
avgReadCpG_mean 0.0147 0.009 1.574 0.117 -0.004 0.033
Omnibus: 11.112 Durbin-Watson: 2.070
Prob(Omnibus): 0.004 Jarque-Bera (JB): 11.804
Skew: 0.560 Prob(JB): 0.00273
Kurtosis: 2.692 Cond. No. 1.46e+10

In [17]:
print("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs1, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")


PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[17]:
<matplotlib.text.Text at 0x10b8a0eb8>

In [18]:
print("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs1, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")


PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[18]:
<matplotlib.text.Text at 0x10bc472b0>

In [19]:
print("PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs1, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")


PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_1
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[19]:
<matplotlib.text.Text at 0x10bdcab70>

In [20]:
tritopool = merged[merged["protocol"] == 'trito_pool_2']       # select only "trito_pool_1" files
print(len(tritopool))
tritopool = tritopool.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
tritopoolA = tritopool.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool.filename,2)) # combines into all pairs
out = pd.DataFrame([tritopoolA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.PDR_unweighted, tritopool.PDR_unweighted)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1a = pd.merge(out, PDR_differences, how='inner')
print(pairs1a.shape)
pairs1a = pairs1a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs1a.PDR_difference # dependent variable to predict

X = pairs1a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


21
(210, 14)
(210,)
(210, 3)
Regression results for CLL 'RRBS_trito_pool1', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[20]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.045
Model: OLS Adj. R-squared: 0.031
Method: Least Squares F-statistic: 3.218
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.0238
Time: 21:42:01 Log-Likelihood: 861.73
No. Observations: 210 AIC: -1715.
Df Residuals: 206 BIC: -1702.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 7.7698 5.477 1.419 0.158 -3.028 18.568
total_cpg_no_filter 1.016e-08 3.65e-09 2.786 0.006 2.97e-09 1.73e-08
bsRate_mean -7.9514 5.670 -1.402 0.162 -19.129 3.226
avgReadCpG_mean -0.0023 0.013 -0.180 0.858 -0.027 0.023
Omnibus: 14.102 Durbin-Watson: 2.436
Prob(Omnibus): 0.001 Jarque-Bera (JB): 10.064
Skew: 0.417 Prob(JB): 0.00653
Kurtosis: 2.327 Cond. No. 2.05e+10

In [21]:
print("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs1a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")


PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[21]:
<matplotlib.text.Text at 0x10ca304a8>

In [22]:
print("PDR_difference vs. mean avgReadCpGs, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs1a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")


PDR_difference vs. mean avgReadCpGs, jointplot, RRBS_trito_pool CLL trito_2
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[22]:
<matplotlib.text.Text at 0x10cc46128>

In [23]:
print("PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs1a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")


PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_2
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[23]:
<matplotlib.text.Text at 0x10cf51e80>

In [ ]:


In [24]:
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.PDR_unweighted, cw154.PDR_unweighted)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2 = pd.merge(out, PDR_differences, how='inner')
print(pairs2.shape)
pairs2 = pairs2.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs2.PDR_difference # dependent variable to predict

X = pairs2.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_Tris_protease', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_Tris_protease', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[24]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.014
Model: OLS Adj. R-squared: -0.002
Method: Least Squares F-statistic: 0.8881
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.448
Time: 21:42:05 Log-Likelihood: 780.58
No. Observations: 190 AIC: -1553.
Df Residuals: 186 BIC: -1540.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -2.8995 1.809 -1.603 0.111 -6.468 0.669
total_cpg_no_filter -1.407e-09 2.99e-09 -0.470 0.639 -7.31e-09 4.5e-09
bsRate_mean 3.0408 1.891 1.608 0.110 -0.691 6.772
avgReadCpG_mean -0.0030 0.005 -0.643 0.521 -0.012 0.006
Omnibus: 13.402 Durbin-Watson: 2.209
Prob(Omnibus): 0.001 Jarque-Bera (JB): 11.455
Skew: 0.518 Prob(JB): 0.00326
Kurtosis: 2.390 Cond. No. 3.35e+09

In [25]:
print("PDR_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs2, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, cw154  CLL cw154_Tris_protease")


PDR_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[25]:
<matplotlib.text.Text at 0x10ca56e80>

In [26]:
print("PDR_difference vs. avgReadCpGs, jointplot, CLL cw154_Tris_protease")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs2, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_Tris_protease")


PDR_difference vs. avgReadCpGs, jointplot, CLL cw154_Tris_protease
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[26]:
<matplotlib.text.Text at 0x10d468f28>

In [27]:
print("PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs2, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_Tris_protease")


PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[27]:
<matplotlib.text.Text at 0x10d69e9e8>

In [28]:
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease_GR']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.PDR_unweighted, cw154.PDR_unweighted)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2a = pd.merge(out, PDR_differences, how='inner')
print(pairs2a.shape)
pairs2a = pairs2a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs2a.PDR_difference # dependent variable to predict

X = pairs2a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_Tris_protease_GR', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_Tris_protease_GR', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[28]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.134
Model: OLS Adj. R-squared: 0.120
Method: Least Squares F-statistic: 9.604
Date: Tue, 09 Aug 2016 Prob (F-statistic): 6.30e-06
Time: 21:42:08 Log-Likelihood: 754.84
No. Observations: 190 AIC: -1502.
Df Residuals: 186 BIC: -1489.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -5.1998 2.359 -2.204 0.029 -9.853 -0.546
total_cpg_no_filter -6.633e-09 2.94e-09 -2.254 0.025 -1.24e-08 -8.27e-10
bsRate_mean 5.3602 2.470 2.170 0.031 0.487 10.233
avgReadCpG_mean 0.0108 0.004 2.451 0.015 0.002 0.019
Omnibus: 16.926 Durbin-Watson: 1.740
Prob(Omnibus): 0.000 Jarque-Bera (JB): 18.834
Skew: 0.756 Prob(JB): 8.13e-05
Kurtosis: 3.304 Cond. No. 4.71e+09

In [29]:
print("PDR_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease_GR")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs2a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, cw154  CLL cw154_Tris_protease_GR")


PDR_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease_GR
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[29]:
<matplotlib.text.Text at 0x10d91db00>

In [30]:
print("PDR_difference vs. avgReadCpgs, jointplot, CLL cw154_Tris_protease_GR")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs2a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_Tris_protease_GR")


PDR_difference vs. avgReadCpgs, jointplot, CLL cw154_Tris_protease_GR
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[30]:
<matplotlib.text.Text at 0x10db24860>

In [31]:
print("PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease_GR")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs2a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_Tris_protease_GR")


PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease_GR
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[31]:
<matplotlib.text.Text at 0x10de36a90>

In [ ]:


In [ ]:


In [32]:
cw154 = merged[merged["protocol"] == 'cw154_CutSmart_proteinase_K']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.PDR_unweighted, cw154.PDR_unweighted)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2b = pd.merge(out, PDR_differences, how='inner')
print(pairs2b.shape)
pairs2b = pairs2b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs2b.PDR_difference # dependent variable to predict

X = pairs2b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[32]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.131
Model: OLS Adj. R-squared: 0.117
Method: Least Squares F-statistic: 9.334
Date: Tue, 09 Aug 2016 Prob (F-statistic): 8.86e-06
Time: 21:42:12 Log-Likelihood: 721.12
No. Observations: 190 AIC: -1434.
Df Residuals: 186 BIC: -1421.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -9.8580 3.375 -2.921 0.004 -16.516 -3.200
total_cpg_no_filter -2.227e-08 4.24e-09 -5.251 0.000 -3.06e-08 -1.39e-08
bsRate_mean 10.1443 3.506 2.894 0.004 3.229 17.060
avgReadCpG_mean 0.0183 0.011 1.598 0.112 -0.004 0.041
Omnibus: 13.999 Durbin-Watson: 1.928
Prob(Omnibus): 0.001 Jarque-Bera (JB): 14.870
Skew: 0.656 Prob(JB): 0.000590
Kurtosis: 3.398 Cond. No. 6.97e+09

In [33]:
print("PDR_difference vs. bsRate, jointplot,  CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs2b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")


PDR_difference vs. bsRate, jointplot,  CLL cw154_CutSmart_proteinase_K
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[33]:
<matplotlib.text.Text at 0x10e0cd5c0>

In [34]:
print("PDR_difference vs. avgReadCpG_mean, jointplot, CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs2b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_CutSmart_proteinase_K")


PDR_difference vs. avgReadCpG_mean, jointplot, CLL cw154_CutSmart_proteinase_K
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[34]:
<matplotlib.text.Text at 0x10e2e5518>

In [35]:
print("PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs2b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")


PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_CutSmart_proteinase_K
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[35]:
<matplotlib.text.Text at 0x10e519198>

In [ ]:


In [36]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell1_22_']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR_unweighted, pcell.PDR_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3 = pd.merge(out, PDR_differences, how='inner')
print(pairs3.shape)
pairs3 = pairs3.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs3.PDR_difference # dependent variable to predict

X = pairs3.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


17
(136, 14)
(136,)
(136, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[36]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.019
Model: OLS Adj. R-squared: -0.004
Method: Least Squares F-statistic: 0.8348
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.477
Time: 21:42:15 Log-Likelihood: 240.63
No. Observations: 136 AIC: -473.3
Df Residuals: 132 BIC: -461.6
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 9.8844 62.393 0.158 0.874 -113.535 133.304
total_cpg_no_filter -9.493e-08 7.87e-08 -1.207 0.230 -2.51e-07 6.07e-08
bsRate_mean -9.0554 62.449 -0.145 0.885 -132.585 114.475
avgReadCpG_mean -0.1446 0.093 -1.550 0.123 -0.329 0.040
Omnibus: 9.622 Durbin-Watson: 1.838
Prob(Omnibus): 0.008 Jarque-Bera (JB): 10.406
Skew: 0.665 Prob(JB): 0.00550
Kurtosis: 2.740 Cond. No. 7.73e+09

In [37]:
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs3, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")


PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[37]:
<matplotlib.text.Text at 0x10e767898>

In [38]:
print("PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs3, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell1_22_")


PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[38]:
<matplotlib.text.Text at 0x10e9d69b0>

In [39]:
print("PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs3, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")


PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[39]:
<matplotlib.text.Text at 0x10ecdd940>

In [ ]:


In [40]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell23_44']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR_unweighted, pcell.PDR_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3a = pd.merge(out, PDR_differences, how='inner')
print(pairs3a.shape)
pairs3a = pairs3a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs3a.PDR_difference # dependent variable to predict

X = pairs3a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


22
(231, 14)
(231,)
(231, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[40]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.086
Model: OLS Adj. R-squared: 0.074
Method: Least Squares F-statistic: 7.148
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.000131
Time: 21:42:18 Log-Likelihood: 518.36
No. Observations: 231 AIC: -1029.
Df Residuals: 227 BIC: -1015.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -53.4329 42.873 -1.246 0.214 -137.913 31.047
total_cpg_no_filter 7.974e-08 3.37e-08 2.367 0.019 1.34e-08 1.46e-07
bsRate_mean 53.0258 42.955 1.234 0.218 -31.616 137.667
avgReadCpG_mean 0.1018 0.023 4.504 0.000 0.057 0.146
Omnibus: 41.269 Durbin-Watson: 1.288
Prob(Omnibus): 0.000 Jarque-Bera (JB): 58.104
Skew: 1.109 Prob(JB): 2.41e-13
Kurtosis: 4.058 Cond. No. 1.03e+10

In [41]:
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs3a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")


PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[41]:
<matplotlib.text.Text at 0x10f037518>

In [42]:
print("PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs3a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell23_44")


PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[42]:
<matplotlib.text.Text at 0x10f271128>

In [43]:
print("PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs3a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")


PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[43]:
<matplotlib.text.Text at 0x10f4a5f98>

In [ ]:


In [44]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell45_66']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR_unweighted, pcell.PDR_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3b = pd.merge(out, PDR_differences, how='inner')
print(pairs3b.shape)
pairs3b = pairs3b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs3b.PDR_difference # dependent variable to predict

X = pairs3b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


9
(36, 14)
(36,)
(36, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[44]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.170
Model: OLS Adj. R-squared: 0.092
Method: Least Squares F-statistic: 2.185
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.109
Time: 21:42:21 Log-Likelihood: 75.676
No. Observations: 36 AIC: -143.4
Df Residuals: 32 BIC: -137.0
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -193.0638 256.757 -0.752 0.458 -716.061 329.933
total_cpg_no_filter -2.863e-08 3.73e-08 -0.768 0.448 -1.05e-07 4.73e-08
bsRate_mean 192.8895 256.980 0.751 0.458 -330.561 716.340
avgReadCpG_mean 0.1321 0.094 1.407 0.169 -0.059 0.323
Omnibus: 1.669 Durbin-Watson: 1.709
Prob(Omnibus): 0.434 Jarque-Bera (JB): 1.582
Skew: 0.442 Prob(JB): 0.453
Kurtosis: 2.478 Cond. No. 3.28e+10

In [45]:
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs3b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")


PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[45]:
<matplotlib.text.Text at 0x10f7d7438>

In [46]:
print("PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs3b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell45_66")


PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[46]:
<matplotlib.text.Text at 0x10f9ff7f0>

In [47]:
print("PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs3b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")


PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[47]:
<matplotlib.text.Text at 0x10fbe4588>

In [ ]:


In [ ]:


In [48]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell67_88']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR_unweighted, pcell.PDR_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3c = pd.merge(out, PDR_differences, how='inner')
print(pairs3c.shape)
pairs3c = pairs3c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs3c.PDR_difference # dependent variable to predict

X = pairs3c.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[48]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.005
Model: OLS Adj. R-squared: -0.011
Method: Least Squares F-statistic: 0.2850
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.836
Time: 21:42:25 Log-Likelihood: 400.81
No. Observations: 190 AIC: -793.6
Df Residuals: 186 BIC: -780.6
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -15.7505 46.752 -0.337 0.737 -107.984 76.483
total_cpg_no_filter 2.689e-08 3.69e-08 0.728 0.468 -4.6e-08 9.98e-08
bsRate_mean 15.8050 46.744 0.338 0.736 -76.412 108.022
avgReadCpG_mean 0.0048 0.059 0.081 0.936 -0.112 0.121
Omnibus: 11.921 Durbin-Watson: 1.586
Prob(Omnibus): 0.003 Jarque-Bera (JB): 13.004
Skew: 0.628 Prob(JB): 0.00150
Kurtosis: 2.741 Cond. No. 8.67e+09

In [49]:
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs3c, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")


PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[49]:
<matplotlib.text.Text at 0x10ff16c18>

In [50]:
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs3c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell67_88")


PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27pcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[50]:
<matplotlib.text.Text at 0x11014f748>

In [51]:
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs3c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")


PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27pcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[51]:
<matplotlib.text.Text at 0x1104879e8>

In [ ]:


In [ ]:


In [52]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell1_22_']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR_unweighted, mcell.PDR_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4 = pd.merge(out, PDR_differences, how='inner')
print(pairs4.shape)
pairs4 = pairs4.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs4.PDR_difference # dependent variable to predict

X = pairs4.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


18
(153, 14)
(153,)
(153, 3)
Regression results for Normal 'NormalBCD19pCD27mcell1_22_', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[52]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.195
Model: OLS Adj. R-squared: 0.178
Method: Least Squares F-statistic: 12.00
Date: Tue, 09 Aug 2016 Prob (F-statistic): 4.41e-07
Time: 21:42:28 Log-Likelihood: 418.19
No. Observations: 153 AIC: -828.4
Df Residuals: 149 BIC: -816.3
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 221.6233 39.953 5.547 0.000 142.676 300.571
total_cpg_no_filter -4.771e-08 2.51e-08 -1.900 0.059 -9.73e-08 1.92e-09
bsRate_mean -222.5182 40.124 -5.546 0.000 -301.805 -143.232
avgReadCpG_mean 0.0595 0.027 2.207 0.029 0.006 0.113
Omnibus: 50.762 Durbin-Watson: 1.626
Prob(Omnibus): 0.000 Jarque-Bera (JB): 92.796
Skew: 1.610 Prob(JB): 7.07e-21
Kurtosis: 5.045 Cond. No. 1.42e+10

In [53]:
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs4, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")


PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[53]:
<matplotlib.text.Text at 0x1106d2710>

In [54]:
print("PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs4, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell1_22_")


PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[54]:
<matplotlib.text.Text at 0x1109a1b00>

In [55]:
print("PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs4, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")


PDR_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[55]:
<matplotlib.text.Text at 0x110cc6400>

In [ ]:


In [56]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell23_44']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR_unweighted, mcell.PDR_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4a = pd.merge(out, PDR_differences, how='inner')
print(pairs4a.shape)
pairs4a = pairs4a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs4a.PDR_difference # dependent variable to predict

X = pairs4a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell23_44', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


18
(153, 14)
(153,)
(153, 3)
Regression results for Normal 'NormalBCD19pCD27mcell23_44', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[56]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.108
Model: OLS Adj. R-squared: 0.090
Method: Least Squares F-statistic: 6.029
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.000665
Time: 21:42:32 Log-Likelihood: 365.67
No. Observations: 153 AIC: -723.3
Df Residuals: 149 BIC: -711.2
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -24.0830 68.026 -0.354 0.724 -158.504 110.338
total_cpg_no_filter 1.29e-07 4.91e-08 2.630 0.009 3.21e-08 2.26e-07
bsRate_mean 24.2419 68.201 0.355 0.723 -110.524 159.008
avgReadCpG_mean -0.0234 0.054 -0.431 0.667 -0.131 0.084
Omnibus: 71.754 Durbin-Watson: 0.220
Prob(Omnibus): 0.000 Jarque-Bera (JB): 178.319
Skew: 2.080 Prob(JB): 1.90e-39
Kurtosis: 6.266 Cond. No. 1.55e+10

In [57]:
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs4a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")


PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[57]:
<matplotlib.text.Text at 0x11103a630>

In [58]:
print("PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs4a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell23_44")


PDR_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[58]:
<matplotlib.text.Text at 0x111325780>

In [59]:
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs4a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")


PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[59]:
<matplotlib.text.Text at 0x11169e550>

In [ ]:


In [60]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell45_66']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR_unweighted, mcell.PDR_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4b = pd.merge(out, PDR_differences, how='inner')
print(pairs4b.shape)
pairs4b = pairs4b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs4b.PDR_difference # dependent variable to predict

X = pairs4b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell45_66', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


17
(136, 14)
(136,)
(136, 3)
Regression results for Normal 'NormalBCD19pCD27mcell45_66', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[60]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.174
Model: OLS Adj. R-squared: 0.155
Method: Least Squares F-statistic: 9.257
Date: Tue, 09 Aug 2016 Prob (F-statistic): 1.34e-05
Time: 21:42:36 Log-Likelihood: 607.92
No. Observations: 136 AIC: -1208.
Df Residuals: 132 BIC: -1196.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 2.2489 7.386 0.304 0.761 -12.361 16.858
total_cpg_no_filter 2.618e-08 6.08e-09 4.304 0.000 1.41e-08 3.82e-08
bsRate_mean -2.3824 7.400 -0.322 0.748 -17.020 12.255
avgReadCpG_mean 0.0233 0.005 5.002 0.000 0.014 0.033
Omnibus: 21.471 Durbin-Watson: 1.299
Prob(Omnibus): 0.000 Jarque-Bera (JB): 26.182
Skew: 0.971 Prob(JB): 2.06e-06
Kurtosis: 3.920 Cond. No. 1.27e+10

In [61]:
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs4b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")


PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[61]:
<matplotlib.text.Text at 0x111991cc0>

In [62]:
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs4b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell45_66")


PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[62]:
<matplotlib.text.Text at 0x111be0668>

In [63]:
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs4b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")


PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[63]:
<matplotlib.text.Text at 0x111dfa2b0>

In [ ]:


In [64]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell67_88']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR_unweighted, mcell.PDR_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4c = pd.merge(out, PDR_differences, how='inner')
print(pairs4c.shape)
pairs4c = pairs4c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs4c.PDR_difference # dependent variable to predict

X = pairs4c.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell67_88', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


21
(210, 14)
(210,)
(210, 3)
Regression results for Normal 'NormalBCD19pCD27mcell67_88', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[64]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.240
Model: OLS Adj. R-squared: 0.229
Method: Least Squares F-statistic: 21.71
Date: Tue, 09 Aug 2016 Prob (F-statistic): 2.94e-12
Time: 21:42:40 Log-Likelihood: 587.51
No. Observations: 210 AIC: -1167.
Df Residuals: 206 BIC: -1154.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -317.6070 63.998 -4.963 0.000 -443.782 -191.431
total_cpg_no_filter 1.886e-07 2.37e-08 7.953 0.000 1.42e-07 2.35e-07
bsRate_mean 317.7381 64.116 4.956 0.000 191.331 444.145
avgReadCpG_mean 0.1121 0.020 5.511 0.000 0.072 0.152
Omnibus: 65.152 Durbin-Watson: 0.266
Prob(Omnibus): 0.000 Jarque-Bera (JB): 127.989
Skew: 1.542 Prob(JB): 1.61e-28
Kurtosis: 5.262 Cond. No. 3.04e+10

In [65]:
print("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs4c, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")


PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[65]:
<matplotlib.text.Text at 0x1121ff8d0>

In [66]:
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs4c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")


PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[66]:
<matplotlib.text.Text at 0x1125464a8>

In [67]:
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs4c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")


PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[67]:
<matplotlib.text.Text at 0x1127f9780>

In [ ]:


In [68]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell1_22_']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.PDR_unweighted, CD19cell.PDR_unweighted)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5 = pd.merge(out, PDR_differences, how='inner')
print(pairs5.shape)
pairs5 = pairs5.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs5.PDR_difference # dependent variable to predict

X = pairs5.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


21
(210, 14)
(210,)
(210, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell1_22_', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[68]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.098
Model: OLS Adj. R-squared: 0.085
Method: Least Squares F-statistic: 7.478
Date: Tue, 09 Aug 2016 Prob (F-statistic): 8.92e-05
Time: 21:42:45 Log-Likelihood: 322.46
No. Observations: 210 AIC: -636.9
Df Residuals: 206 BIC: -623.5
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -334.6772 86.455 -3.871 0.000 -505.126 -164.228
total_cpg_no_filter -2.412e-08 2.16e-08 -1.118 0.265 -6.66e-08 1.84e-08
bsRate_mean 336.2137 86.685 3.879 0.000 165.310 507.117
avgReadCpG_mean -0.0986 0.059 -1.682 0.094 -0.214 0.017
Omnibus: 87.232 Durbin-Watson: 1.967
Prob(Omnibus): 0.000 Jarque-Bera (JB): 15.224
Skew: 0.304 Prob(JB): 0.000494
Kurtosis: 1.829 Cond. No. 1.48e+10

In [69]:
print("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs5, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")


PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[69]:
<matplotlib.text.Text at 0x112afac50>

In [70]:
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs5, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell1_22_")


PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[70]:
<matplotlib.text.Text at 0x112d4a198>

In [71]:
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs5, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")


PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[71]:
<matplotlib.text.Text at 0x113124f60>

In [ ]:


In [72]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell23_44']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.PDR_unweighted, CD19cell.PDR_unweighted)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5a = pd.merge(out, PDR_differences, how='inner')
print(pairs5a.shape)
pairs5a = pairs5a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs5a.PDR_difference # dependent variable to predict

X = pairs5a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell23_44', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell23_44', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[72]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.165
Model: OLS Adj. R-squared: 0.151
Method: Least Squares F-statistic: 12.24
Date: Tue, 09 Aug 2016 Prob (F-statistic): 2.41e-07
Time: 21:42:48 Log-Likelihood: 346.01
No. Observations: 190 AIC: -684.0
Df Residuals: 186 BIC: -671.0
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -242.6886 44.154 -5.496 0.000 -329.795 -155.582
total_cpg_no_filter 7.302e-09 1.29e-08 0.566 0.572 -1.81e-08 3.27e-08
bsRate_mean 244.0383 44.332 5.505 0.000 156.580 331.497
avgReadCpG_mean -0.1073 0.048 -2.248 0.026 -0.202 -0.013
Omnibus: 13.477 Durbin-Watson: 1.115
Prob(Omnibus): 0.001 Jarque-Bera (JB): 15.009
Skew: 0.687 Prob(JB): 0.000551
Kurtosis: 2.900 Cond. No. 1.63e+10

In [73]:
print("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs5a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")


PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[73]:
<matplotlib.text.Text at 0x112240ef0>

In [74]:
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs5a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell23_44")


PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[74]:
<matplotlib.text.Text at 0x1135a29b0>

In [75]:
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs5a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")


PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[75]:
<matplotlib.text.Text at 0x1138ac320>

In [ ]:


In [76]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell45_66']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.PDR_unweighted, CD19cell.PDR_unweighted)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5b = pd.merge(out, PDR_differences, how='inner')
print(pairs5b.shape)
pairs5b = pairs5b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs5b.PDR_difference # dependent variable to predict

X = pairs5b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell45_66', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


22
(231, 14)
(231,)
(231, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell45_66', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[76]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.253
Model: OLS Adj. R-squared: 0.244
Method: Least Squares F-statistic: 25.69
Date: Tue, 09 Aug 2016 Prob (F-statistic): 2.41e-14
Time: 21:42:52 Log-Likelihood: 403.17
No. Observations: 231 AIC: -798.3
Df Residuals: 227 BIC: -784.6
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -802.7319 111.502 -7.199 0.000 -1022.442 -583.021
total_cpg_no_filter 3.681e-08 1.67e-08 2.201 0.029 3.85e-09 6.98e-08
bsRate_mean 805.5372 111.697 7.212 0.000 585.441 1025.633
avgReadCpG_mean -0.2354 0.064 -3.684 0.000 -0.361 -0.109
Omnibus: 12.491 Durbin-Watson: 1.764
Prob(Omnibus): 0.002 Jarque-Bera (JB): 11.652
Skew: 0.490 Prob(JB): 0.00295
Kurtosis: 2.498 Cond. No. 2.98e+10

In [77]:
print("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs5b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")


PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[77]:
<matplotlib.text.Text at 0x113b46550>

In [78]:
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs5b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell45_66")


PDR_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[78]:
<matplotlib.text.Text at 0x113d93748>

In [79]:
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs5b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")


PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[79]:
<matplotlib.text.Text at 0x114167780>

In [ ]:


In [80]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell67_88']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.PDR_unweighted, CD19cell.PDR_unweighted)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5c = pd.merge(out, PDR_differences, how='inner')
print(pairs5c.shape)
pairs5c = pairs5c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs5c.PDR_difference # dependent variable to predict

X = pairs5c.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell67_88', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell67_88', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[80]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.055
Model: OLS Adj. R-squared: 0.040
Method: Least Squares F-statistic: 3.600
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.0146
Time: 21:42:55 Log-Likelihood: 315.48
No. Observations: 190 AIC: -623.0
Df Residuals: 186 BIC: -610.0
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -149.5143 94.421 -1.583 0.115 -335.788 36.759
total_cpg_no_filter -1.958e-08 1.81e-08 -1.081 0.281 -5.53e-08 1.62e-08
bsRate_mean 150.4565 94.652 1.590 0.114 -36.273 337.186
avgReadCpG_mean -0.1089 0.047 -2.340 0.020 -0.201 -0.017
Omnibus: 15.836 Durbin-Watson: 2.501
Prob(Omnibus): 0.000 Jarque-Bera (JB): 7.226
Skew: 0.251 Prob(JB): 0.0270
Kurtosis: 2.187 Cond. No. 1.62e+10

In [81]:
print("PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs5c, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")


PDR_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[81]:
<matplotlib.text.Text at 0x1143fad68>

In [82]:
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs5c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")


PDR_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[82]:
<matplotlib.text.Text at 0x114645f60>

In [83]:
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs5c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")


PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[83]:
<matplotlib.text.Text at 0x1149597f0>

In [ ]:


In [84]:
normb = merged[merged["protocol"] == 'normal_B_cell_A1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6 = pd.merge(out, PDR_differences, how='inner')
print(pairs6.shape)
pairs6 = pairs6.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6.PDR_difference # dependent variable to predict

X = pairs6.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_A1_24', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_A1_24', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[84]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.063
Model: OLS Adj. R-squared: 0.046
Method: Least Squares F-statistic: 3.715
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.0127
Time: 21:42:59 Log-Likelihood: 240.87
No. Observations: 171 AIC: -473.7
Df Residuals: 167 BIC: -461.2
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -10.4477 16.447 -0.635 0.526 -42.919 22.024
total_cpg_no_filter 1.158e-07 1.76e-07 0.657 0.512 -2.32e-07 4.64e-07
bsRate_mean 12.8866 17.158 0.751 0.454 -20.988 46.761
avgReadCpG_mean -0.3515 0.109 -3.230 0.001 -0.566 -0.137
Omnibus: 22.110 Durbin-Watson: 1.894
Prob(Omnibus): 0.000 Jarque-Bera (JB): 6.588
Skew: 0.089 Prob(JB): 0.0371
Kurtosis: 2.055 Cond. No. 9.64e+08

In [85]:
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24")


PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[85]:
<matplotlib.text.Text at 0x114bdb668>

In [86]:
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_A1_24")


PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_A1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[86]:
<matplotlib.text.Text at 0x114e126a0>

In [87]:
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_A1_24")


PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_A1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[87]:
<matplotlib.text.Text at 0x114fa6a58>

In [ ]:


In [88]:
normb = merged[merged["protocol"] == 'normal_B_cell_B1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6a = pd.merge(out, PDR_differences, how='inner')
print(pairs6a.shape)
pairs6a = pairs6a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6a.PDR_difference # dependent variable to predict

X = pairs6a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_B1_24', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'normal_B_cell_B1_24', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[88]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.006
Model: OLS Adj. R-squared: -0.010
Method: Least Squares F-statistic: 0.3493
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.790
Time: 21:43:03 Log-Likelihood: 241.02
No. Observations: 190 AIC: -474.0
Df Residuals: 186 BIC: -461.1
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -33.3112 43.938 -0.758 0.449 -119.992 53.370
total_cpg_no_filter -1.548e-08 5.69e-08 -0.272 0.786 -1.28e-07 9.69e-08
bsRate_mean 35.2861 46.052 0.766 0.445 -55.566 126.138
avgReadCpG_mean -0.0943 0.094 -0.999 0.319 -0.281 0.092
Omnibus: 626.340 Durbin-Watson: 2.029
Prob(Omnibus): 0.000 Jarque-Bera (JB): 16.666
Skew: 0.114 Prob(JB): 0.000240
Kurtosis: 1.567 Cond. No. 4.74e+09

In [89]:
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24")


PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[89]:
<matplotlib.text.Text at 0x11520a240>

In [90]:
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_B1_24")


PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_B1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[90]:
<matplotlib.text.Text at 0x115516eb8>

In [91]:
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_B1_24")


PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_B1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[91]:
<matplotlib.text.Text at 0x115819ef0>

In [ ]:


In [92]:
normb = merged[merged["protocol"] == 'normal_B_cell_C1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6b = pd.merge(out, PDR_differences, how='inner')
print(pairs6b.shape)
pairs6b = pairs6b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6b.PDR_difference # dependent variable to predict

X = pairs6b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_C1_24', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_C1_24', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[92]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.028
Model: OLS Adj. R-squared: 0.011
Method: Least Squares F-statistic: 1.625
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.186
Time: 21:43:09 Log-Likelihood: 232.89
No. Observations: 171 AIC: -457.8
Df Residuals: 167 BIC: -445.2
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 83.2234 39.882 2.087 0.038 4.485 161.962
total_cpg_no_filter 1.676e-08 4.26e-08 0.394 0.694 -6.73e-08 1.01e-07
bsRate_mean -87.4228 41.729 -2.095 0.038 -169.808 -5.038
avgReadCpG_mean 0.1828 0.123 1.488 0.139 -0.060 0.425
Omnibus: 73.476 Durbin-Watson: 1.027
Prob(Omnibus): 0.000 Jarque-Bera (JB): 10.245
Skew: -0.000 Prob(JB): 0.00596
Kurtosis: 1.801 Cond. No. 7.61e+09

In [93]:
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24")


PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[93]:
<matplotlib.text.Text at 0x115a4acf8>

In [94]:
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_C1_24")


PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_C1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[94]:
<matplotlib.text.Text at 0x115c9ff60>

In [95]:
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_C1_24")


PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_C1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[95]:
<matplotlib.text.Text at 0x115fa6b38>

In [ ]:


In [96]:
normb = merged[merged["protocol"] == 'normal_B_cell_D1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6c = pd.merge(out, PDR_differences, how='inner')
print(pairs6c.shape)
pairs6c = pairs6c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6c.PDR_difference # dependent variable to predict

X = pairs6c.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_D1_24', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_D1_24', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[96]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.021
Model: OLS Adj. R-squared: 0.004
Method: Least Squares F-statistic: 1.223
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.303
Time: 21:43:13 Log-Likelihood: 242.78
No. Observations: 171 AIC: -477.6
Df Residuals: 167 BIC: -465.0
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 47.9533 26.147 1.834 0.068 -3.667 99.574
total_cpg_no_filter 6.938e-09 3.59e-08 0.193 0.847 -6.4e-08 7.79e-08
bsRate_mean -49.8327 27.307 -1.825 0.070 -103.744 4.078
avgReadCpG_mean 0.0074 0.087 0.086 0.932 -0.163 0.178
Omnibus: 744.791 Durbin-Watson: 2.114
Prob(Omnibus): 0.000 Jarque-Bera (JB): 16.422
Skew: 0.205 Prob(JB): 0.000272
Kurtosis: 1.538 Cond. No. 5.21e+09

In [97]:
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6c, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24")


PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[97]:
<matplotlib.text.Text at 0x116294b38>

In [98]:
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_D1_24")


PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_D1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[98]:
<matplotlib.text.Text at 0x1164f4b00>

In [99]:
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_D1_24")


PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_D1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[99]:
<matplotlib.text.Text at 0x1167fe9b0>

In [ ]:


In [100]:
normb = merged[merged["protocol"] == 'normal_B_cell_G1_22']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6d = pd.merge(out, PDR_differences, how='inner')
print(pairs6d.shape)
pairs6d = pairs6d.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6d.PDR_difference # dependent variable to predict

X = pairs6d.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_G1_22', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_G1_22', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[100]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.079
Model: OLS Adj. R-squared: 0.063
Method: Least Squares F-statistic: 4.801
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.00311
Time: 21:43:17 Log-Likelihood: 251.52
No. Observations: 171 AIC: -495.0
Df Residuals: 167 BIC: -482.5
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -23.4274 27.827 -0.842 0.401 -78.366 31.511
total_cpg_no_filter 9.462e-09 5.1e-08 0.186 0.853 -9.12e-08 1.1e-07
bsRate_mean 22.7980 29.393 0.776 0.439 -35.232 80.828
avgReadCpG_mean 0.2995 0.126 2.384 0.018 0.051 0.548
Omnibus: 44.794 Durbin-Watson: 2.026
Prob(Omnibus): 0.000 Jarque-Bera (JB): 10.966
Skew: 0.297 Prob(JB): 0.00416
Kurtosis: 1.911 Cond. No. 4.87e+09

In [101]:
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6d, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22")


PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[101]:
<matplotlib.text.Text at 0x116b3b1d0>

In [102]:
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6d, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_G1_22")


PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_G1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[102]:
<matplotlib.text.Text at 0x116d504e0>

In [103]:
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6d, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_G1_22")


PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_G1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[103]:
<matplotlib.text.Text at 0x117063f60>

In [ ]:


In [104]:
normb = merged[merged["protocol"] == 'normal_B_cell_H1_22']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6e = pd.merge(out, PDR_differences, how='inner')
print(pairs6e.shape)
pairs6e = pairs6e.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6e.PDR_difference # dependent variable to predict

X = pairs6e.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_H1_22', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


15
(105, 14)
(105,)
(105, 3)
Regression results for Normal 'normal_B_cell_H1_22', predict \delta PDR_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[104]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.126
Model: OLS Adj. R-squared: 0.101
Method: Least Squares F-statistic: 4.875
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.00330
Time: 21:43:23 Log-Likelihood: 158.14
No. Observations: 105 AIC: -308.3
Df Residuals: 101 BIC: -297.7
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 72.0957 46.928 1.536 0.128 -20.997 165.188
total_cpg_no_filter 2.255e-07 7.01e-08 3.217 0.002 8.65e-08 3.65e-07
bsRate_mean -72.3757 48.581 -1.490 0.139 -168.747 23.996
avgReadCpG_mean -0.4834 0.213 -2.270 0.025 -0.906 -0.061
Omnibus: 12.572 Durbin-Watson: 1.411
Prob(Omnibus): 0.002 Jarque-Bera (JB): 9.065
Skew: 0.596 Prob(JB): 0.0108
Kurtosis: 2.194 Cond. No. 5.84e+09

In [105]:
print("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=pairs6e, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22")


PDR_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[105]:
<matplotlib.text.Text at 0x1172f0c18>

In [106]:
print("PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=pairs6e, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_H1_22")


PDR_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_H1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[106]:
<matplotlib.text.Text at 0x117539748>

In [107]:
print("PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=pairs6e, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_H1_22")


PDR_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_H1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[107]:
<matplotlib.text.Text at 0x117761a58>

In [108]:
print(pairs1.shape)
print(pairs1a.shape)
print(pairs2.shape)
print(pairs2a.shape)
print(pairs2b.shape)
print(pairs3.shape)
print(pairs3a.shape)
print(pairs3b.shape)
print(pairs3c.shape)
print(pairs4.shape)
print(pairs4a.shape)
print(pairs4b.shape)
print(pairs4c.shape)
print(pairs5.shape)
print(pairs5a.shape)
print(pairs5b.shape)
print(pairs5c.shape)
print(pairs6.shape)
print(pairs6a.shape)
print(pairs6b.shape)
print(pairs6c.shape)
print(pairs6d.shape)
print(pairs6e.shape)


(210, 14)
(210, 14)
(190, 14)
(190, 14)
(190, 14)
(136, 14)
(231, 14)
(36, 14)
(190, 14)
(153, 14)
(153, 14)
(136, 14)
(210, 14)
(210, 14)
(190, 14)
(231, 14)
(190, 14)
(171, 14)
(190, 14)
(171, 14)
(171, 14)
(171, 14)
(105, 14)

In [109]:
pairs1['type'] = str('CLL')
pairs1a['type'] = str('CLL')
pairs2['type'] = str('CLL')
pairs2a['type'] = str('CLL')
pairs2b['type'] = str('CLL')
pairs3['type'] = str('normal')
pairs3a['type'] = str('normal')
pairs3b['type'] = str('normal')
pairs3c['type'] = str('normal')
pairs4['type'] = str('normal')
pairs4a['type'] = str('normal')
pairs4b['type'] = str('normal')
pairs4c['type'] = str('normal')
pairs5['type'] = str('normal')
pairs5a['type'] = str('normal')
pairs5b['type'] = str('normal')
pairs5c['type'] = str('normal')
pairs6['type'] = str('normal')
pairs6a['type'] = str('normal')
pairs6b['type'] = str('normal')
pairs6c['type'] = str('normal')
pairs6d['type'] = str('normal')
pairs6e['type'] = str('normal')


frames = [pairs1, pairs1a, pairs2, pairs2a, pairs2b, pairs3, pairs3a, pairs3b, pairs3c, 
          pairs4, pairs4a, pairs4b, pairs4c, pairs5, pairs5a, pairs5b, pairs5c, pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e]

In [110]:
total_pairs = pd.concat(frames)

In [111]:
total_pairs.shape


Out[111]:
(4035, 15)

In [112]:
y = total_pairs.PDR_difference # dependent variable
X = total_pairs.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

In [113]:
categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)
    
X = X.drop(['type_normal'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR_unweighted")
est.summary()


Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR_unweighted
Out[113]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.294
Model: OLS Adj. R-squared: 0.294
Method: Least Squares F-statistic: 420.0
Date: Tue, 09 Aug 2016 Prob (F-statistic): 6.78e-303
Time: 21:43:27 Log-Likelihood: 6995.5
No. Observations: 4035 AIC: -1.398e+04
Df Residuals: 4030 BIC: -1.395e+04
Df Model: 4
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 1.5161 0.059 25.786 0.000 1.401 1.631
total_cpg_no_filter 2.43e-08 3.71e-09 6.551 0.000 1.7e-08 3.16e-08
bsRate_mean -1.2129 0.044 -27.679 0.000 -1.299 -1.127
avgReadCpG_mean -0.0520 0.008 -6.835 0.000 -0.067 -0.037
type_CLL -0.0677 0.002 -37.237 0.000 -0.071 -0.064
Omnibus: 443.674 Durbin-Watson: 1.574
Prob(Omnibus): 0.000 Jarque-Bera (JB): 607.560
Skew: 0.881 Prob(JB): 1.17e-132
Kurtosis: 3.714 Cond. No. 4.97e+07

In [114]:
X.head()


Out[114]:
const total_cpg_no_filter bsRate_mean avgReadCpG_mean type_CLL
0 1 709541.5 0.980181 5.354731 1.0
1 1 786284.0 0.980293 5.367319 1.0
2 1 683623.5 0.980268 5.369633 1.0
3 1 853200.5 0.980262 5.367604 1.0
4 1 842686.5 0.980228 5.374553 1.0

In [115]:
print("PDR_difference vs. bsRate, jointplot, both CLL and normal ")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, both CLL and normal ")


PDR_difference vs. bsRate, jointplot, both CLL and normal 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[115]:
<matplotlib.text.Text at 0x117aafb00>

In [116]:
print("PDR_difference vs. mean avgReadCpG per cell, jointplot, both CLL and normal ")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpG_mean, jointplot, both CLL and normal ")


PDR_difference vs. mean avgReadCpG per cell, jointplot, both CLL and normal 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[116]:
<matplotlib.text.Text at 0x117d90ef0>

In [117]:
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. total # Unique_CpGs per cell, jointplot, both CLL and normal ")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[117]:
<matplotlib.text.Text at 0x11807f358>

In [118]:
print("PDR_difference vs. bsRate, by type! ")
sns.lmplot(x="bsRate_mean", y="PDR_difference",  data=total_pairs, hue='type')
plt.title("PDR_difference vs. bsRate, by type, CLL vs normal")


PDR_difference vs. bsRate, by type! 
Out[118]:
<matplotlib.text.Text at 0x1184555c0>

In [119]:
print("PDR_difference vs. avgReadCpG per cell, by type! ")
sns.lmplot(x="avgReadCpG_mean", y="PDR_difference",  data=total_pairs, hue='type')
plt.title("PDR_difference vs. avgReadCpG per cell, by type, CLL vs Normal ")


PDR_difference vs. avgReadCpG per cell, by type! 
Out[119]:
<matplotlib.text.Text at 0x1184ca2b0>

In [120]:
print("PDR_difference vs. total unique CpG per cel, by type!")
sns.lmplot(x="total_cpg_no_filter", y="PDR_difference",  data=total_pairs, hue='type')
plt.title("PDR_difference vs. total unique CpG per cell, by type, CLL vs Normal ")


PDR_difference vs. total unique CpG per cel, by type!
Out[120]:
<matplotlib.text.Text at 0x1185c8780>

In [121]:
#
# Let's see feature ranking
#
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: cell type, avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:11: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
Random Forest model score is 0.899985586214

In [122]:
#
# Now do analysis by cell type, CLL vs normal, entire
#
cll_frames = [pairs1, pairs1a, pairs2, pairs2a, pairs2b]

normal_frames = [pairs3, pairs3a, pairs3b, pairs3c, pairs4, pairs4a, pairs4b, pairs4c, 
                 pairs5, pairs5a, pairs5b, pairs5c, pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e]

cll_pairs = pd.concat(cll_frames)
print(cll_pairs.shape)

normal_pairs = pd.concat(normal_frames)
print(normal_pairs.shape)


(990, 15)
(3045, 15)

In [123]:
#
# CLL first
#
y = cll_pairs.PDR_difference # dependent variable
X = cll_pairs.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG", "type"], axis=1)

X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR")
print(X.shape)
est.summary()


Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR
(990, 4)
Out[123]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.014
Model: OLS Adj. R-squared: 0.011
Method: Least Squares F-statistic: 4.693
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.00293
Time: 21:46:31 Log-Likelihood: 3925.6
No. Observations: 990 AIC: -7843.
Df Residuals: 986 BIC: -7824.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 0.0578 0.029 1.968 0.049 0.000 0.116
total_cpg_no_filter 1.48e-10 1.25e-09 0.118 0.906 -2.31e-09 2.6e-09
bsRate_mean -0.0679 0.028 -2.418 0.016 -0.123 -0.013
avgReadCpG_mean 0.0026 0.002 1.290 0.197 -0.001 0.007
Omnibus: 127.540 Durbin-Watson: 2.003
Prob(Omnibus): 0.000 Jarque-Bera (JB): 181.180
Skew: 0.941 Prob(JB): 4.54e-40
Kurtosis: 3.923 Cond. No. 1.63e+08

In [124]:
print("PDR_difference vs. bsRate, jointplot, CLL pairs ")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot,  CLL pairs ")


PDR_difference vs. bsRate, jointplot, CLL pairs 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[124]:
<matplotlib.text.Text at 0x117d862b0>

In [125]:
print("PDR_difference vs. mean avgReadCpG per cell, jointplot,  CLL pairs ")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpG_mean, jointplot, CLL pairs ")


PDR_difference vs. mean avgReadCpG per cell, jointplot,  CLL pairs 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[125]:
<matplotlib.text.Text at 0x22ab6e400>

In [126]:
print("PDR difference vs. total # unique CpGs per cell, CLL pairs")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. total # Unique_CpGs per cell,  CLL pairs")


PDR difference vs. total # unique CpGs per cell, CLL pairs
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[126]:
<matplotlib.text.Text at 0x22aff0ac8>

In [127]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
Random Forest model score is 0.871840994723

In [ ]:


In [128]:
#
# normal
#
y = normal_pairs.PDR_difference # dependent variable
X = normal_pairs.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG", "type"], axis=1)

X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR")
print(X.shape)
est.summary()


Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR
(3045, 4)
Out[128]:
OLS Regression Results
Dep. Variable: PDR_difference R-squared: 0.182
Model: OLS Adj. R-squared: 0.182
Method: Least Squares F-statistic: 226.2
Date: Tue, 09 Aug 2016 Prob (F-statistic): 1.73e-132
Time: 21:47:19 Log-Likelihood: 4868.6
No. Observations: 3045 AIC: -9729.
Df Residuals: 3041 BIC: -9705.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 1.6137 0.071 22.778 0.000 1.475 1.753
total_cpg_no_filter 1.962e-08 4.97e-09 3.946 0.000 9.87e-09 2.94e-08
bsRate_mean -1.2785 0.053 -24.242 0.000 -1.382 -1.175
avgReadCpG_mean -0.0578 0.010 -6.006 0.000 -0.077 -0.039
Omnibus: 231.939 Durbin-Watson: 1.583
Prob(Omnibus): 0.000 Jarque-Bera (JB): 286.202
Skew: 0.748 Prob(JB): 7.11e-63
Kurtosis: 2.857 Cond. No. 4.12e+07

In [129]:
print("PDR_difference vs. bsRate, jointplot, Normal pairs ")
sns.jointplot(x="bsRate_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal pairs ")


PDR_difference vs. bsRate, jointplot, Normal pairs 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[129]:
<matplotlib.text.Text at 0x118f16278>

In [130]:
print("PDR_difference vs. mean avgReadCpG per cell, jointplot, Normal pairs ")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpG_mean, jointplot, Normal pairs ")


PDR_difference vs. mean avgReadCpG per cell, jointplot, Normal pairs 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[130]:
<matplotlib.text.Text at 0x1192d4240>

In [131]:
print("PDR diference vs. total # unique CpGs per cell, Normal pairs")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference",  data=total_pairs, kind="reg")
plt.title("PDR_difference vs. total # Unique_CpGs per cell, Normal pairs")


PDR diference vs. total # unique CpGs per cell, Normal pairs
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[131]:
<matplotlib.text.Text at 0x1197cb748>

In [132]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
Random Forest model score is 0.883297918148

In [ ]:


In [ ]:


In [ ]:


In [ ]: