In [1]:
%matplotlib inline

In [2]:
#
# Covariates are: 
#   - Number of unique CpGs per cell
#   - Median Average Read CpG per cell (or mean if normally distributed)
#   - BS rate per cell
#   - CLL or Normal status per cell
#
# For distances (i.e. PDR difference between pairs, methylation difference between pairs), 
# the covariates are the mean between the two pairs. 
#

In [3]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # print all rows


import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')

import statsmodels.api as sm

In [4]:
stats = pd.read_csv("all_RRBS_statistics_final.csv")

In [5]:
stats.shape


Out[5]:
(438, 15)

In [6]:
normal = stats[stats["type"]=="normal"]
CLL = stats[stats["type"]=="CLL"]

In [7]:
len(normal)


Out[7]:
336

In [8]:
len(CLL)


Out[8]:
102

In [9]:
mcell_cpg = pd.read_csv("mcell_avgCpGs.csv")
pcell_cpg = pd.read_csv("pcell_avgCpGs.csv")
CD19cell_cpg = pd.read_csv("CD19_avgCpGs.csv")
normalB_cell_cpg = pd.read_csv("normalB_avgCpGs.csv")
trito_cell_cpg = pd.read_csv("trito_avgCpGs.csv")
cw154_cell_cpg = pd.read_csv("cw154_cpgs.csv")

In [10]:
mcell_cpg = mcell_cpg.drop(["Unnamed: 0"], axis=1)
pcell_cpg = pcell_cpg.drop(["Unnamed: 0"], axis=1)
CD19cell_cpg = CD19cell_cpg.drop(["Unnamed: 0"], axis=1)
normalB_cell_cpg = normalB_cell_cpg.drop(["Unnamed: 0"], axis=1)
trito_cell_cpg = trito_cell_cpg.drop(["Unnamed: 0"], axis=1)
cw154_cell_cpg = cw154_cell_cpg.drop(["Unnamed: 0"], axis=1)

In [11]:
cpg_total = pd.concat([mcell_cpg, pcell_cpg, CD19cell_cpg, normalB_cell_cpg, trito_cell_cpg, cw154_cell_cpg])

In [12]:
cpg_total.shape


Out[12]:
(513, 4)

In [13]:
merged = stats.merge(cpg_total, on="filename")

In [14]:
merged.shape


Out[14]:
(438, 18)

In [15]:
merged.head()


Out[15]:
filename methylation PDR_total methylation_unweighted PDR_unweighted thisMeth mixedReadCount total_reads type bio protocol total_cpg_no_filter total_cpg_gtrthan1 total_cpg_gtrthan38 bsRate avgReadCpgs_nofilter avgReadCpgs_lessthan1CpG avgReadCpgs_gtreql3.8CpG
0 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CATGAC 0.529505 0.235795 0.632802 0.231878 2208325.0 983394.0 4170549.0 normal CD19CD27m NormalBCD19pCD27mcell67_88 525282.0 525251.0 435636.0 0.9975 5.354284 5.355660 7.019255
1 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CCTTCG 0.455550 0.177631 0.583859 0.175371 733064.0 285841.0 1609185.0 normal CD19CD27m NormalBCD19pCD27mcell67_88 221972.0 221962.0 186757.0 0.9975 5.587294 5.588449 7.302612
2 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CGGTAG 0.515269 0.177645 0.618578 0.174221 1452802.0 500870.0 2819500.0 normal CD19CD27m NormalBCD19pCD27mcell67_88 355730.0 355713.0 295624.0 0.9975 5.393199 5.394331 7.079288
3 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTATTG 0.556175 0.176367 0.652727 0.172273 2279354.0 722800.0 4098270.0 normal CD19CD27m NormalBCD19pCD27mcell67_88 483179.0 483150.0 397812.0 0.9975 5.287116 5.288477 6.979525
4 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTCAGC 0.528642 0.181331 0.640401 0.172287 1394208.0 478231.0 2637340.0 normal CD19CD27m NormalBCD19pCD27mcell67_88 356122.0 356100.0 294065.0 0.9975 5.314302 5.315719 6.995052

In [16]:
sns.lmplot(x="bsRate", y="methylation",  data=merged)
plt.title("methylation (weighted) vs bisulfite conversion rate")


Out[16]:
<matplotlib.text.Text at 0x10d7cf160>

In [17]:
sns.lmplot(x="bsRate", y="methylation_unweighted",  data=merged)
plt.title("methylation (unweighted) vs bisulfite conversion rate")


Out[17]:
<matplotlib.text.Text at 0x10d898198>

In [18]:
sns.lmplot(x="bsRate", y="PDR_total",  data=merged)
plt.title("PDR (weighted) vs bisulfite conversion rate")


Out[18]:
<matplotlib.text.Text at 0x10dabaef0>

In [19]:
sns.lmplot(x="bsRate", y="PDR_unweighted",  data=merged)
plt.title("PDR (unweighted) vs bisulfite conversion rate")


Out[19]:
<matplotlib.text.Text at 0x10dbc4c18>

In [20]:
sns.lmplot(x="bsRate", y="total_cpg_no_filter",  data=merged)
plt.title("Total # unique CpGs per cell (no filter) vs bisulfite conversion rate")


Out[20]:
<matplotlib.text.Text at 0x10e4fdb00>

In [21]:
sns.lmplot(x="bsRate", y="total_cpg_gtrthan1",  data=merged)
plt.title("Total # unique CpGs per cell (filter: > 1) vs bisulfite conversion rate")


Out[21]:
<matplotlib.text.Text at 0x10e603cf8>

In [22]:
sns.lmplot(x="bsRate", y="total_cpg_gtrthan38",  data=merged)
plt.title("Total # unique CpGs per cell (filter: >= 3.8) vs bisulfite conversion rate")


Out[22]:
<matplotlib.text.Text at 0x10e7096d8>

In [23]:
sns.lmplot(x="bsRate", y="avgReadCpgs_nofilter",  data=merged)
plt.title("Mean avgCpG read per cell (no filter) vs bisulfite conversion rate")


Out[23]:
<matplotlib.text.Text at 0x10e895a20>

In [24]:
sns.lmplot(x="bsRate", y="avgReadCpgs_lessthan1CpG",  data=merged)
plt.title("Mean avgCpG read per cell (filter: > 1) vs bisulfite conversion rate")


Out[24]:
<matplotlib.text.Text at 0x10e8fcac8>

In [25]:
sns.lmplot(x="bsRate", y="avgReadCpgs_gtreql3.8CpG",  data=merged)
plt.title("Mean avgCpG read per cell (filter: >= 3.8) vs bisulfite conversion rate")


Out[25]:
<matplotlib.text.Text at 0x10eb23400>

In [ ]:


In [ ]:


In [26]:
tritopool = merged[merged["protocol"] == 'trito_pool_1']       # select only "trito_pool_1" files
print(len(tritopool))
tritopool = tritopool.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
tritopoolA = tritopool.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool.filename,2)) # combines into all pairs
out = pd.DataFrame([tritopoolA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.methylation, tritopool.methylation)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1 = pd.merge(out, methylation_differences, how='inner')
print(pairs1.shape)
pairs1 = pd.merge(out, methylation_differences, how='inner')
print(pairs1.shape)
pairs1 = pairs1.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs1.methylation_difference # dependent variable to predict

X = pairs1.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


21
(210, 14)
(210, 14)
(210,)
(210, 3)
Regression results for CLL 'RRBS_trito_pool1', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[26]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.051
Model: OLS Adj. R-squared: 0.037
Method: Least Squares F-statistic: 3.694
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.0127
Time: 18:36:00 Log-Likelihood: 634.72
No. Observations: 210 AIC: -1261.
Df Residuals: 206 BIC: -1248.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -30.3253 11.387 -2.663 0.008 -52.775 -7.875
total_cpg_no_filter -3.279e-08 1.12e-08 -2.915 0.004 -5.5e-08 -1.06e-08
bsRate_mean 30.6098 11.611 2.636 0.009 7.719 53.501
avgReadCpG_mean 0.0516 0.029 1.793 0.075 -0.005 0.108
Omnibus: 35.998 Durbin-Watson: 2.175
Prob(Omnibus): 0.000 Jarque-Bera (JB): 10.728
Skew: 0.253 Prob(JB): 0.00468
Kurtosis: 2.015 Cond. No. 1.46e+10

In [27]:
print("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs1, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")


methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[27]:
<matplotlib.text.Text at 0x10ed2c978>

In [28]:
print("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs1, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")


methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[28]:
<matplotlib.text.Text at 0x10f027ef0>

In [29]:
print("methylation_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs1, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")


methylation_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_1
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[29]:
<matplotlib.text.Text at 0x10f33ed68>

In [30]:
tritopool = merged[merged["protocol"] == 'trito_pool_2']       # select only "trito_pool_1" files
print(len(tritopool))
tritopool = tritopool.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
tritopoolA = tritopool.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool.filename,2)) # combines into all pairs
out = pd.DataFrame([tritopoolA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.methylation, tritopool.methylation)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1a = pd.merge(out, methylation_differences, how='inner')
print(pairs1a.shape)
pairs1a = pairs1a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs1a.methylation_difference # dependent variable to predict

X = pairs1a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


21
(210, 14)
(210,)
(210, 3)
Regression results for CLL 'RRBS_trito_pool1', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[30]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.174
Model: OLS Adj. R-squared: 0.162
Method: Least Squares F-statistic: 14.46
Date: Tue, 09 Aug 2016 Prob (F-statistic): 1.39e-08
Time: 18:36:04 Log-Likelihood: 664.61
No. Observations: 210 AIC: -1321.
Df Residuals: 206 BIC: -1308.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -50.2686 11.460 -4.386 0.000 -72.862 -27.675
total_cpg_no_filter -5.383e-08 9.12e-09 -5.900 0.000 -7.18e-08 -3.58e-08
bsRate_mean 51.5942 11.885 4.341 0.000 28.162 75.026
avgReadCpG_mean -0.0031 0.029 -0.106 0.916 -0.060 0.054
Omnibus: 24.923 Durbin-Watson: 2.253
Prob(Omnibus): 0.000 Jarque-Bera (JB): 7.923
Skew: 0.135 Prob(JB): 0.0190
Kurtosis: 2.088 Cond. No. 1.68e+10

In [31]:
print("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs1a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")


methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[31]:
<matplotlib.text.Text at 0x10f5ab2b0>

In [32]:
print("methylation_difference vs. mean avgReadCpGs, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs1a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")


methylation_difference vs. mean avgReadCpGs, jointplot, RRBS_trito_pool CLL trito_2
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[32]:
<matplotlib.text.Text at 0x10f7eb9b0>

In [33]:
print("methylation_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs1a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")


methylation_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_2
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[33]:
<matplotlib.text.Text at 0x10faeac18>

In [ ]:


In [34]:
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.methylation, cw154.methylation)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2 = pd.merge(out, methylation_differences, how='inner')
print(pairs2.shape)
pairs2 = pairs2.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs2.methylation_difference # dependent variable to predict

X = pairs2.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_Tris_protease', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_Tris_protease', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[34]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.302
Model: OLS Adj. R-squared: 0.291
Method: Least Squares F-statistic: 26.82
Date: Tue, 09 Aug 2016 Prob (F-statistic): 1.84e-14
Time: 18:36:09 Log-Likelihood: 443.80
No. Observations: 190 AIC: -879.6
Df Residuals: 186 BIC: -866.6
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 29.9982 10.370 2.893 0.004 9.541 50.456
total_cpg_no_filter -8.72e-08 1.76e-08 -4.962 0.000 -1.22e-07 -5.25e-08
bsRate_mean -32.8258 10.856 -3.024 0.003 -54.242 -11.409
avgReadCpG_mean 0.2277 0.033 6.904 0.000 0.163 0.293
Omnibus: 8.958 Durbin-Watson: 1.474
Prob(Omnibus): 0.011 Jarque-Bera (JB): 12.260
Skew: 0.301 Prob(JB): 0.00218
Kurtosis: 4.089 Cond. No. 3.27e+09

In [35]:
print("methylation_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs2, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, cw154  CLL cw154_Tris_protease")


methylation_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[35]:
<matplotlib.text.Text at 0x10fe46780>

In [36]:
print("methylation_difference vs. avgReadCpGs, jointplot, CLL cw154_Tris_protease")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs2, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_Tris_protease")


methylation_difference vs. avgReadCpGs, jointplot, CLL cw154_Tris_protease
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[36]:
<matplotlib.text.Text at 0x11001d7b8>

In [37]:
print("methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs2, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_Tris_protease")


methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[37]:
<matplotlib.text.Text at 0x110274860>

In [38]:
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease_GR']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.methylation, cw154.methylation)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2a = pd.merge(out, methylation_differences, how='inner')
print(pairs2a.shape)
pairs2a = pairs2a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs2a.methylation_difference # dependent variable to predict

X = pairs2a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_Tris_protease_GR', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_Tris_protease_GR', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[38]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.169
Model: OLS Adj. R-squared: 0.156
Method: Least Squares F-statistic: 12.63
Date: Tue, 09 Aug 2016 Prob (F-statistic): 1.50e-07
Time: 18:36:13 Log-Likelihood: 460.18
No. Observations: 190 AIC: -912.4
Df Residuals: 186 BIC: -899.4
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -40.9702 10.552 -3.883 0.000 -61.787 -20.154
total_cpg_no_filter -5.822e-08 1.3e-08 -4.465 0.000 -8.39e-08 -3.25e-08
bsRate_mean 42.6072 11.072 3.848 0.000 20.765 64.449
avgReadCpG_mean 0.0120 0.022 0.557 0.578 -0.031 0.055
Omnibus: 3.281 Durbin-Watson: 2.393
Prob(Omnibus): 0.194 Jarque-Bera (JB): 2.603
Skew: -0.160 Prob(JB): 0.272
Kurtosis: 2.524 Cond. No. 4.47e+09

In [39]:
print("methylation_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease_GR")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs2a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, cw154  CLL cw154_Tris_protease_GR")


methylation_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease_GR
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[39]:
<matplotlib.text.Text at 0x110501160>

In [40]:
print("methylation_difference vs. avgReadCpgs, jointplot, CLL cw154_Tris_protease_GR")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs2a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_Tris_protease_GR")


methylation_difference vs. avgReadCpgs, jointplot, CLL cw154_Tris_protease_GR
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[40]:
<matplotlib.text.Text at 0x1102ac2e8>

In [41]:
print("methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease_GR")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs2a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_Tris_protease_GR")


methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease_GR
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[41]:
<matplotlib.text.Text at 0x1109756a0>

In [ ]:


In [ ]:


In [42]:
cw154 = merged[merged["protocol"] == 'cw154_CutSmart_proteinase_K']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.methylation, cw154.methylation)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2b = pd.merge(out, methylation_differences, how='inner')
print(pairs2b.shape)
pairs2b = pairs2b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs2b.methylation_difference # dependent variable to predict

X = pairs2b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[42]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.097
Model: OLS Adj. R-squared: 0.083
Method: Least Squares F-statistic: 6.674
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.000265
Time: 18:36:16 Log-Likelihood: 514.91
No. Observations: 190 AIC: -1022.
Df Residuals: 186 BIC: -1009.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -37.7684 9.986 -3.782 0.000 -57.469 -18.068
total_cpg_no_filter -4.613e-08 1.24e-08 -3.717 0.000 -7.06e-08 -2.16e-08
bsRate_mean 38.9209 10.353 3.759 0.000 18.497 59.345
avgReadCpG_mean 0.0417 0.034 1.237 0.218 -0.025 0.108
Omnibus: 7.301 Durbin-Watson: 1.539
Prob(Omnibus): 0.026 Jarque-Bera (JB): 6.896
Skew: 0.411 Prob(JB): 0.0318
Kurtosis: 2.557 Cond. No. 6.96e+09

In [43]:
print("methylation_difference vs. bsRate, jointplot,  CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs2b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")


methylation_difference vs. bsRate, jointplot,  CLL cw154_CutSmart_proteinase_K
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[43]:
<matplotlib.text.Text at 0x110cfd748>

In [44]:
print("methylation_difference vs. avgReadCpG_mean, jointplot, CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs2b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_CutSmart_proteinase_K")


methylation_difference vs. avgReadCpG_mean, jointplot, CLL cw154_CutSmart_proteinase_K
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[44]:
<matplotlib.text.Text at 0x110f42240>

In [45]:
print("methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs2b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")


methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_CutSmart_proteinase_K
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[45]:
<matplotlib.text.Text at 0x11118b6a0>

In [ ]:


In [46]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell1_22_']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation, pcell.methylation)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3 = pd.merge(out, methylation_differences, how='inner')
print(pairs3.shape)
pairs3 = pairs3.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs3.methylation_difference # dependent variable to predict

X = pairs3.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


17
(136, 14)
(136,)
(136, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[46]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.176
Model: OLS Adj. R-squared: 0.158
Method: Least Squares F-statistic: 9.422
Date: Tue, 09 Aug 2016 Prob (F-statistic): 1.10e-05
Time: 18:36:20 Log-Likelihood: 329.50
No. Observations: 136 AIC: -651.0
Df Residuals: 132 BIC: -639.4
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -80.5968 31.693 -2.543 0.012 -143.288 -17.905
total_cpg_no_filter 1.518e-07 3.97e-08 3.825 0.000 7.33e-08 2.3e-07
bsRate_mean 79.4942 31.773 2.502 0.014 16.644 142.344
avgReadCpG_mean 0.1876 0.041 4.554 0.000 0.106 0.269
Omnibus: 3.415 Durbin-Watson: 1.088
Prob(Omnibus): 0.181 Jarque-Bera (JB): 3.118
Skew: 0.293 Prob(JB): 0.210
Kurtosis: 2.545 Cond. No. 7.56e+09

In [47]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs3, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[47]:
<matplotlib.text.Text at 0x1113ab470>

In [48]:
print("methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs3, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell1_22_")


methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[48]:
<matplotlib.text.Text at 0x11160dcf8>

In [49]:
print("methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs3, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")


methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[49]:
<matplotlib.text.Text at 0x1119e6470>

In [ ]:


In [50]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell23_44']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation, pcell.methylation)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3a = pd.merge(out, methylation_differences, how='inner')
print(pairs3a.shape)
pairs3a = pairs3a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs3a.methylation_difference # dependent variable to predict

X = pairs3a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


22
(231, 14)
(231,)
(231, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[50]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.191
Model: OLS Adj. R-squared: 0.180
Method: Least Squares F-statistic: 17.89
Date: Tue, 09 Aug 2016 Prob (F-statistic): 1.87e-10
Time: 18:36:23 Log-Likelihood: 523.58
No. Observations: 231 AIC: -1039.
Df Residuals: 227 BIC: -1025.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -79.0053 42.215 -1.872 0.063 -162.188 4.177
total_cpg_no_filter 8.53e-08 3.24e-08 2.633 0.009 2.15e-08 1.49e-07
bsRate_mean 78.2656 42.287 1.851 0.065 -5.060 161.591
avgReadCpG_mean 0.1346 0.020 6.857 0.000 0.096 0.173
Omnibus: 8.090 Durbin-Watson: 1.492
Prob(Omnibus): 0.018 Jarque-Bera (JB): 8.393
Skew: 0.465 Prob(JB): 0.0151
Kurtosis: 2.921 Cond. No. 1.03e+10

In [51]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs3a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[51]:
<matplotlib.text.Text at 0x111d5c780>

In [52]:
print("methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs3a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell23_44")


methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[52]:
<matplotlib.text.Text at 0x111f9be10>

In [53]:
print("methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs3a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")


methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[53]:
<matplotlib.text.Text at 0x1121a5390>

In [ ]:


In [54]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell45_66']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation, pcell.methylation)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3b = pd.merge(out, methylation_differences, how='inner')
print(pairs3b.shape)
pairs3b = pairs3b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs3b.methylation_difference # dependent variable to predict

X = pairs3b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


9
(36, 14)
(36,)
(36, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[54]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.422
Model: OLS Adj. R-squared: 0.368
Method: Least Squares F-statistic: 7.792
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.000482
Time: 18:36:27 Log-Likelihood: 85.081
No. Observations: 36 AIC: -162.2
Df Residuals: 32 BIC: -155.8
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 371.9341 197.834 1.880 0.069 -31.040 774.909
total_cpg_no_filter -1.834e-08 2.82e-08 -0.650 0.520 -7.58e-08 3.91e-08
bsRate_mean -373.2763 197.958 -1.886 0.068 -776.504 29.952
avgReadCpG_mean 0.0663 0.063 1.045 0.304 -0.063 0.195
Omnibus: 4.895 Durbin-Watson: 1.943
Prob(Omnibus): 0.086 Jarque-Bera (JB): 3.450
Skew: -0.676 Prob(JB): 0.178
Kurtosis: 3.687 Cond. No. 3.29e+10

In [55]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs3b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[55]:
<matplotlib.text.Text at 0x1124ce470>

In [56]:
print("methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs3b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell45_66")


methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[56]:
<matplotlib.text.Text at 0x1126fbd68>

In [57]:
print("methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs3b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")


methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[57]:
<matplotlib.text.Text at 0x112ad05f8>

In [ ]:


In [ ]:


In [58]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell67_88']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation, pcell.methylation)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3c = pd.merge(out, methylation_differences, how='inner')
print(pairs3c.shape)
pairs3c = pairs3c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs3c.methylation_difference # dependent variable to predict

X = pairs3c.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[58]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.047
Model: OLS Adj. R-squared: 0.031
Method: Least Squares F-statistic: 3.045
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.0301
Time: 18:36:30 Log-Likelihood: 524.85
No. Observations: 190 AIC: -1042.
Df Residuals: 186 BIC: -1029.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 62.9080 23.818 2.641 0.009 15.920 109.896
total_cpg_no_filter 3.265e-08 1.96e-08 1.667 0.097 -6e-09 7.13e-08
bsRate_mean -63.0209 23.812 -2.647 0.009 -109.998 -16.044
avgReadCpG_mean -0.0074 0.026 -0.290 0.772 -0.058 0.043
Omnibus: 13.771 Durbin-Watson: 1.488
Prob(Omnibus): 0.001 Jarque-Bera (JB): 15.195
Skew: 0.693 Prob(JB): 0.000502
Kurtosis: 3.023 Cond. No. 8.48e+09

In [59]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs3c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[59]:
<matplotlib.text.Text at 0x112d44c18>

In [60]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs3c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell67_88")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27pcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[60]:
<matplotlib.text.Text at 0x113053f28>

In [61]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs3c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27pcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[61]:
<matplotlib.text.Text at 0x1132cee80>

In [ ]:


In [ ]:


In [62]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell1_22_']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation, mcell.methylation)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4 = pd.merge(out, methylation_differences, how='inner')
print(pairs4.shape)
pairs4 = pairs4.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs4.methylation_difference # dependent variable to predict

X = pairs4.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


18
(153, 14)
(153,)
(153, 3)
Regression results for Normal 'NormalBCD19pCD27mcell1_22_', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[62]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.475
Model: OLS Adj. R-squared: 0.464
Method: Least Squares F-statistic: 44.88
Date: Tue, 09 Aug 2016 Prob (F-statistic): 1.01e-20
Time: 18:36:34 Log-Likelihood: 358.78
No. Observations: 153 AIC: -709.6
Df Residuals: 149 BIC: -697.4
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 31.8745 63.262 0.504 0.615 -93.132 156.881
total_cpg_no_filter 2.164e-07 3.49e-08 6.208 0.000 1.48e-07 2.85e-07
bsRate_mean -34.4878 63.575 -0.542 0.588 -160.112 91.137
avgReadCpG_mean 0.3535 0.037 9.641 0.000 0.281 0.426
Omnibus: 1.604 Durbin-Watson: 1.420
Prob(Omnibus): 0.448 Jarque-Bera (JB): 1.661
Skew: 0.201 Prob(JB): 0.436
Kurtosis: 2.685 Cond. No. 1.52e+10

In [63]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs4, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[63]:
<matplotlib.text.Text at 0x113502f60>

In [64]:
print("methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs4, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell1_22_")


methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[64]:
<matplotlib.text.Text at 0x1137618d0>

In [65]:
print("methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs4, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")


methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[65]:
<matplotlib.text.Text at 0x113a86dd8>

In [ ]:


In [66]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell23_44']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation, mcell.methylation)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4a = pd.merge(out, methylation_differences, how='inner')
print(pairs4a.shape)
pairs4a = pairs4a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs4a.methylation_difference # dependent variable to predict

X = pairs4a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell23_44', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


18
(153, 14)
(153,)
(153, 3)
Regression results for Normal 'NormalBCD19pCD27mcell23_44', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[66]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.171
Model: OLS Adj. R-squared: 0.154
Method: Least Squares F-statistic: 10.21
Date: Tue, 09 Aug 2016 Prob (F-statistic): 3.72e-06
Time: 18:36:37 Log-Likelihood: 406.36
No. Observations: 153 AIC: -804.7
Df Residuals: 149 BIC: -792.6
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -124.8072 52.266 -2.388 0.018 -228.086 -21.529
total_cpg_no_filter 1.656e-07 3.5e-08 4.733 0.000 9.65e-08 2.35e-07
bsRate_mean 124.5439 52.412 2.376 0.019 20.978 228.110
avgReadCpG_mean 0.0759 0.035 2.139 0.034 0.006 0.146
Omnibus: 6.175 Durbin-Watson: 1.448
Prob(Omnibus): 0.046 Jarque-Bera (JB): 6.310
Skew: 0.494 Prob(JB): 0.0426
Kurtosis: 2.888 Cond. No. 1.55e+10

In [67]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs4a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[67]:
<matplotlib.text.Text at 0x113dd79e8>

In [68]:
print("methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs4a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell23_44")


methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[68]:
<matplotlib.text.Text at 0x113f49048>

In [69]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs4a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[69]:
<matplotlib.text.Text at 0x114276f28>

In [ ]:


In [70]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell45_66']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation, mcell.methylation)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4b = pd.merge(out, methylation_differences, how='inner')
print(pairs4b.shape)
pairs4b = pairs4b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs4b.methylation_difference # dependent variable to predict

X = pairs4b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell45_66', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


17
(136, 14)
(136,)
(136, 3)
Regression results for Normal 'NormalBCD19pCD27mcell45_66', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[70]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.336
Model: OLS Adj. R-squared: 0.321
Method: Least Squares F-statistic: 22.30
Date: Tue, 09 Aug 2016 Prob (F-statistic): 9.63e-12
Time: 18:36:41 Log-Likelihood: 309.52
No. Observations: 136 AIC: -611.0
Df Residuals: 132 BIC: -599.4
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 0.4083 66.568 0.006 0.995 -131.270 132.086
total_cpg_no_filter 3.337e-07 5.76e-08 5.794 0.000 2.2e-07 4.48e-07
bsRate_mean -2.8663 66.672 -0.043 0.966 -134.749 129.017
avgReadCpG_mean 0.3437 0.043 7.960 0.000 0.258 0.429
Omnibus: 3.749 Durbin-Watson: 1.332
Prob(Omnibus): 0.153 Jarque-Bera (JB): 3.798
Skew: 0.389 Prob(JB): 0.150
Kurtosis: 2.745 Cond. No. 1.28e+10

In [71]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs4b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[71]:
<matplotlib.text.Text at 0x1145cbd68>

In [72]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs4b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell45_66")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[72]:
<matplotlib.text.Text at 0x11389bbe0>

In [73]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs4b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[73]:
<matplotlib.text.Text at 0x114ab2320>

In [ ]:


In [74]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell67_88']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation, mcell.methylation)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4c = pd.merge(out, methylation_differences, how='inner')
print(pairs4c.shape)
pairs4c = pairs4c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs4c.methylation_difference # dependent variable to predict

X = pairs4c.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell67_88', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


21
(210, 14)
(210,)
(210, 3)
Regression results for Normal 'NormalBCD19pCD27mcell67_88', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[74]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.163
Model: OLS Adj. R-squared: 0.151
Method: Least Squares F-statistic: 13.41
Date: Tue, 09 Aug 2016 Prob (F-statistic): 5.01e-08
Time: 18:36:45 Log-Likelihood: 462.71
No. Observations: 210 AIC: -917.4
Df Residuals: 206 BIC: -904.0
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -273.3499 119.265 -2.292 0.023 -508.486 -38.213
total_cpg_no_filter 1.556e-07 4.42e-08 3.522 0.001 6.85e-08 2.43e-07
bsRate_mean 272.5222 119.451 2.281 0.024 37.019 508.025
avgReadCpG_mean 0.2106 0.036 5.914 0.000 0.140 0.281
Omnibus: 3.871 Durbin-Watson: 2.144
Prob(Omnibus): 0.144 Jarque-Bera (JB): 3.833
Skew: 0.290 Prob(JB): 0.147
Kurtosis: 2.682 Cond. No. 3.13e+10

In [75]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs4c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[75]:
<matplotlib.text.Text at 0x114e1eb70>

In [76]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs4c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[76]:
<matplotlib.text.Text at 0x114ff6cf8>

In [77]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs4c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[77]:
<matplotlib.text.Text at 0x11523ecf8>

In [ ]:


In [78]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell1_22_']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.methylation, CD19cell.methylation)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5 = pd.merge(out, methylation_differences, how='inner')
print(pairs5.shape)
pairs5 = pairs5.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs5.methylation_difference # dependent variable to predict

X = pairs5.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


21
(210, 14)
(210,)
(210, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell1_22_', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[78]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.076
Model: OLS Adj. R-squared: 0.063
Method: Least Squares F-statistic: 5.645
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.000977
Time: 18:36:49 Log-Likelihood: 457.19
No. Observations: 210 AIC: -906.4
Df Residuals: 206 BIC: -893.0
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -8.5868 45.516 -0.189 0.851 -98.324 81.151
total_cpg_no_filter -3.285e-08 1.14e-08 -2.890 0.004 -5.53e-08 -1.04e-08
bsRate_mean 8.0917 45.637 0.177 0.859 -81.885 98.068
avgReadCpG_mean 0.1083 0.031 3.511 0.001 0.047 0.169
Omnibus: 20.949 Durbin-Watson: 1.419
Prob(Omnibus): 0.000 Jarque-Bera (JB): 24.161
Skew: 0.811 Prob(JB): 5.67e-06
Kurtosis: 3.365 Cond. No. 1.48e+10

In [79]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs5, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")


methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[79]:
<matplotlib.text.Text at 0x1155b79b0>

In [80]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs5, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell1_22_")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[80]:
<matplotlib.text.Text at 0x1157df240>

In [81]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs5, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[81]:
<matplotlib.text.Text at 0x115a24d68>

In [ ]:


In [82]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell23_44']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.methylation, CD19cell.methylation)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5a = pd.merge(out, methylation_differences, how='inner')
print(pairs5a.shape)
pairs5a = pairs5a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs5a.methylation_difference # dependent variable to predict

X = pairs5a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell23_44', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell23_44', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[82]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.141
Model: OLS Adj. R-squared: 0.127
Method: Least Squares F-statistic: 10.17
Date: Tue, 09 Aug 2016 Prob (F-statistic): 3.11e-06
Time: 18:36:52 Log-Likelihood: 443.89
No. Observations: 190 AIC: -879.8
Df Residuals: 186 BIC: -866.8
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -49.0700 26.378 -1.860 0.064 -101.108 2.968
total_cpg_no_filter -3.547e-08 7.72e-09 -4.595 0.000 -5.07e-08 -2.02e-08
bsRate_mean 48.8384 26.484 1.844 0.067 -3.409 101.086
avgReadCpG_mean 0.0786 0.029 2.753 0.006 0.022 0.135
Omnibus: 15.542 Durbin-Watson: 1.021
Prob(Omnibus): 0.000 Jarque-Bera (JB): 16.858
Skew: 0.702 Prob(JB): 0.000218
Kurtosis: 3.401 Cond. No. 1.63e+10

In [83]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs5a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")


methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[83]:
<matplotlib.text.Text at 0x115e6d828>

In [84]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs5a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell23_44")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[84]:
<matplotlib.text.Text at 0x115f44f28>

In [85]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs5a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[85]:
<matplotlib.text.Text at 0x116274470>

In [ ]:


In [86]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell45_66']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.methylation, CD19cell.methylation)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5b = pd.merge(out, methylation_differences, how='inner')
print(pairs5b.shape)
pairs5b = pairs5b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs5b.methylation_difference # dependent variable to predict

X = pairs5b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell45_66', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


22
(231, 14)
(231,)
(231, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell45_66', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[86]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.136
Model: OLS Adj. R-squared: 0.125
Method: Least Squares F-statistic: 11.91
Date: Tue, 09 Aug 2016 Prob (F-statistic): 2.84e-07
Time: 18:36:57 Log-Likelihood: 562.90
No. Observations: 231 AIC: -1118.
Df Residuals: 227 BIC: -1104.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 177.4453 55.841 3.178 0.002 67.413 287.478
total_cpg_no_filter -6.119e-09 8.4e-09 -0.728 0.467 -2.27e-08 1.04e-08
bsRate_mean -177.0063 55.939 -3.164 0.002 -287.232 -66.780
avgReadCpG_mean -0.1357 0.032 -4.237 0.000 -0.199 -0.073
Omnibus: 12.686 Durbin-Watson: 1.478
Prob(Omnibus): 0.002 Jarque-Bera (JB): 13.548
Skew: 0.592 Prob(JB): 0.00114
Kurtosis: 3.068 Cond. No. 2.98e+10

In [87]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs5b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")


methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[87]:
<matplotlib.text.Text at 0x1164b4be0>

In [88]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs5b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell45_66")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[88]:
<matplotlib.text.Text at 0x116754128>

In [89]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs5b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[89]:
<matplotlib.text.Text at 0x116b353c8>

In [ ]:


In [90]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell67_88']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.methylation, CD19cell.methylation)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5c = pd.merge(out, methylation_differences, how='inner')
print(pairs5c.shape)
pairs5c = pairs5c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs5c.methylation_difference # dependent variable to predict

X = pairs5c.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell67_88', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell67_88', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[90]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.038
Model: OLS Adj. R-squared: 0.023
Method: Least Squares F-statistic: 2.458
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.0643
Time: 18:37:00 Log-Likelihood: 432.68
No. Observations: 190 AIC: -857.4
Df Residuals: 186 BIC: -844.4
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 71.2926 50.936 1.400 0.163 -29.193 171.779
total_cpg_no_filter -6.118e-09 9.79e-09 -0.625 0.533 -2.54e-08 1.32e-08
bsRate_mean -71.7353 51.060 -1.405 0.162 -172.467 28.996
avgReadCpG_mean 0.0637 0.025 2.537 0.012 0.014 0.113
Omnibus: 9.227 Durbin-Watson: 1.561
Prob(Omnibus): 0.010 Jarque-Bera (JB): 9.799
Skew: 0.552 Prob(JB): 0.00745
Kurtosis: 2.857 Cond. No. 1.62e+10

In [91]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs5c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")


methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[91]:
<matplotlib.text.Text at 0x116e91be0>

In [92]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs5c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[92]:
<matplotlib.text.Text at 0x1170ec550>

In [93]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs5c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[93]:
<matplotlib.text.Text at 0x1174d96d8>

In [ ]:


In [94]:
normb = merged[merged["protocol"] == 'normal_B_cell_A1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6 = pd.merge(out, methylation_differences, how='inner')
print(pairs6.shape)
pairs6 = pairs6.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6.methylation_difference # dependent variable to predict

X = pairs6.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_A1_24', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_A1_24', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[94]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.076
Model: OLS Adj. R-squared: 0.059
Method: Least Squares F-statistic: 4.560
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.00425
Time: 18:37:03 Log-Likelihood: 476.34
No. Observations: 171 AIC: -944.7
Df Residuals: 167 BIC: -932.1
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 6.9159 4.154 1.665 0.098 -1.286 15.117
total_cpg_no_filter 1.243e-07 4.48e-08 2.774 0.006 3.58e-08 2.13e-07
bsRate_mean -6.9159 4.329 -1.598 0.112 -15.462 1.630
avgReadCpG_mean -0.0419 0.026 -1.628 0.105 -0.093 0.009
Omnibus: 13.706 Durbin-Watson: 1.123
Prob(Omnibus): 0.001 Jarque-Bera (JB): 15.393
Skew: 0.734 Prob(JB): 0.000454
Kurtosis: 2.930 Cond. No. 9.65e+08

In [95]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24")


methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[95]:
<matplotlib.text.Text at 0x11783ef98>

In [96]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_A1_24")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_A1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[96]:
<matplotlib.text.Text at 0x117a7c8d0>

In [97]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_A1_24")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_A1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[97]:
<matplotlib.text.Text at 0x1173dd2b0>

In [ ]:


In [98]:
normb = merged[merged["protocol"] == 'normal_B_cell_B1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6a = pd.merge(out, methylation_differences, how='inner')
print(pairs6a.shape)
pairs6a = pairs6a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6a.methylation_difference # dependent variable to predict

X = pairs6a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_B1_24', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'normal_B_cell_B1_24', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[98]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.006
Model: OLS Adj. R-squared: -0.011
Method: Least Squares F-statistic: 0.3451
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.793
Time: 18:37:07 Log-Likelihood: 409.77
No. Observations: 190 AIC: -811.5
Df Residuals: 186 BIC: -798.6
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -9.3140 17.013 -0.547 0.585 -42.878 24.250
total_cpg_no_filter 1.368e-08 2.33e-08 0.587 0.558 -3.23e-08 5.97e-08
bsRate_mean 9.8276 17.870 0.550 0.583 -25.426 45.081
avgReadCpG_mean -0.0135 0.040 -0.336 0.737 -0.093 0.066
Omnibus: 11.696 Durbin-Watson: 1.748
Prob(Omnibus): 0.003 Jarque-Bera (JB): 12.109
Skew: 0.583 Prob(JB): 0.00235
Kurtosis: 2.590 Cond. No. 4.47e+09

In [99]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24")


methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[99]:
<matplotlib.text.Text at 0x1180722b0>

In [100]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_B1_24")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_B1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[100]:
<matplotlib.text.Text at 0x11836f6d8>

In [101]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_B1_24")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_B1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[101]:
<matplotlib.text.Text at 0x1185a50f0>

In [ ]:


In [102]:
normb = merged[merged["protocol"] == 'normal_B_cell_C1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6b = pd.merge(out, methylation_differences, how='inner')
print(pairs6b.shape)
pairs6b = pairs6b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6b.methylation_difference # dependent variable to predict

X = pairs6b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_C1_24', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_C1_24', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[102]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.015
Model: OLS Adj. R-squared: -0.002
Method: Least Squares F-statistic: 0.8661
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.460
Time: 18:37:11 Log-Likelihood: 369.55
No. Observations: 171 AIC: -731.1
Df Residuals: 167 BIC: -718.5
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -2.0674 16.912 -0.122 0.903 -35.457 31.322
total_cpg_no_filter -2.583e-08 1.9e-08 -1.359 0.176 -6.34e-08 1.17e-08
bsRate_mean 2.3154 17.679 0.131 0.896 -32.588 37.219
avgReadCpG_mean -0.0147 0.056 -0.263 0.793 -0.125 0.096
Omnibus: 12.063 Durbin-Watson: 1.344
Prob(Omnibus): 0.002 Jarque-Bera (JB): 13.236
Skew: 0.666 Prob(JB): 0.00134
Kurtosis: 2.713 Cond. No. 7.17e+09

In [103]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24")


methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[103]:
<matplotlib.text.Text at 0x1188038d0>

In [104]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_C1_24")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_C1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[104]:
<matplotlib.text.Text at 0x118a5ecc0>

In [105]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_C1_24")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_C1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[105]:
<matplotlib.text.Text at 0x118e1f400>

In [ ]:


In [106]:
normb = merged[merged["protocol"] == 'normal_B_cell_D1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6c = pd.merge(out, methylation_differences, how='inner')
print(pairs6c.shape)
pairs6c = pairs6c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6c.methylation_difference # dependent variable to predict

X = pairs6c.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_D1_24', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_D1_24', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[106]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.009
Model: OLS Adj. R-squared: -0.009
Method: Least Squares F-statistic: 0.4985
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.684
Time: 18:37:14 Log-Likelihood: 400.84
No. Observations: 171 AIC: -793.7
Df Residuals: 167 BIC: -781.1
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 8.4566 10.614 0.797 0.427 -12.498 29.411
total_cpg_no_filter -1.428e-09 1.37e-08 -0.104 0.917 -2.85e-08 2.56e-08
bsRate_mean -8.6228 11.136 -0.774 0.440 -30.609 13.363
avgReadCpG_mean -0.0182 0.042 -0.436 0.663 -0.101 0.064
Omnibus: 11.347 Durbin-Watson: 1.718
Prob(Omnibus): 0.003 Jarque-Bera (JB): 12.434
Skew: 0.653 Prob(JB): 0.00200
Kurtosis: 2.802 Cond. No. 5.35e+09

In [107]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24")


methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[107]:
<matplotlib.text.Text at 0x119076438>

In [108]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_D1_24")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_D1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[108]:
<matplotlib.text.Text at 0x1192d20b8>

In [109]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_D1_24")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_D1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[109]:
<matplotlib.text.Text at 0x1195e0a58>

In [ ]:


In [110]:
normb = merged[merged["protocol"] == 'normal_B_cell_G1_22']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6d = pd.merge(out, methylation_differences, how='inner')
print(pairs6d.shape)
pairs6d = pairs6d.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6d.methylation_difference # dependent variable to predict

X = pairs6d.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_G1_22', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_G1_22', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[110]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.098
Model: OLS Adj. R-squared: 0.082
Method: Least Squares F-statistic: 6.041
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.000626
Time: 18:37:17 Log-Likelihood: 369.17
No. Observations: 171 AIC: -730.3
Df Residuals: 167 BIC: -717.8
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 21.8211 12.056 1.810 0.072 -1.981 45.624
total_cpg_no_filter 4.343e-08 2.49e-08 1.742 0.083 -5.8e-09 9.27e-08
bsRate_mean -24.4352 12.682 -1.927 0.056 -49.472 0.602
avgReadCpG_mean 0.2367 0.057 4.184 0.000 0.125 0.348
Omnibus: 9.120 Durbin-Watson: 1.943
Prob(Omnibus): 0.010 Jarque-Bera (JB): 9.711
Skew: 0.571 Prob(JB): 0.00778
Kurtosis: 2.761 Cond. No. 4.19e+09

In [111]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6d, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22")


methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[111]:
<matplotlib.text.Text at 0x119912518>

In [112]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6d, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_G1_22")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_G1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[112]:
<matplotlib.text.Text at 0x119b5d6d8>

In [113]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6d, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_G1_22")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_G1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[113]:
<matplotlib.text.Text at 0x119e64cf8>

In [ ]:


In [ ]:


In [ ]:


In [114]:
normb = merged[merged["protocol"] == 'normal_B_cell_H1_22']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6e = pd.merge(out, methylation_differences, how='inner')
print(pairs6e.shape)
pairs6e = pairs6e.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6e.methylation_difference # dependent variable to predict

X = pairs6e.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_H1_22', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


15
(105, 14)
(105,)
(105, 3)
Regression results for Normal 'normal_B_cell_H1_22', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[114]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.090
Model: OLS Adj. R-squared: 0.063
Method: Least Squares F-statistic: 3.315
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.0230
Time: 18:37:21 Log-Likelihood: 217.59
No. Observations: 105 AIC: -427.2
Df Residuals: 101 BIC: -416.6
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 10.1334 27.564 0.368 0.714 -44.547 64.814
total_cpg_no_filter -5.553e-08 4.02e-08 -1.382 0.170 -1.35e-07 2.42e-08
bsRate_mean -7.9851 28.418 -0.281 0.779 -64.358 48.388
avgReadCpG_mean -0.3432 0.113 -3.029 0.003 -0.568 -0.118
Omnibus: 8.251 Durbin-Watson: 1.945
Prob(Omnibus): 0.016 Jarque-Bera (JB): 8.349
Skew: 0.689 Prob(JB): 0.0154
Kurtosis: 3.107 Cond. No. 6.03e+09

In [115]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6e, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22")


methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[115]:
<matplotlib.text.Text at 0x118a59630>

In [116]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6e, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_H1_22")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_H1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[116]:
<matplotlib.text.Text at 0x11a27ffd0>

In [117]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6e, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_H1_22")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_H1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[117]:
<matplotlib.text.Text at 0x11a4c7748>

In [118]:
print(pairs1.shape)
print(pairs1a.shape)
print(pairs2.shape)
print(pairs2a.shape)
print(pairs2b.shape)
print(pairs3.shape)
print(pairs3a.shape)
print(pairs3b.shape)
print(pairs3c.shape)
print(pairs4.shape)
print(pairs4a.shape)
print(pairs4b.shape)
print(pairs4c.shape)
print(pairs5.shape)
print(pairs5a.shape)
print(pairs5b.shape)
print(pairs5c.shape)
print(pairs6.shape)
print(pairs6a.shape)
print(pairs6b.shape)
print(pairs6c.shape)
print(pairs6d.shape)
print(pairs6e.shape)


(210, 14)
(210, 14)
(190, 14)
(190, 14)
(190, 14)
(136, 14)
(231, 14)
(36, 14)
(190, 14)
(153, 14)
(153, 14)
(136, 14)
(210, 14)
(210, 14)
(190, 14)
(231, 14)
(190, 14)
(171, 14)
(190, 14)
(171, 14)
(171, 14)
(171, 14)
(105, 14)

In [119]:
pairs1['type'] = str('CLL')
pairs1a['type'] = str('CLL')
pairs2['type'] = str('CLL')
pairs2a['type'] = str('CLL')
pairs2b['type'] = str('CLL')
pairs3['type'] = str('normal')
pairs3a['type'] = str('normal')
pairs3b['type'] = str('normal')
pairs3c['type'] = str('normal')
pairs4['type'] = str('normal')
pairs4a['type'] = str('normal')
pairs4b['type'] = str('normal')
pairs4c['type'] = str('normal')
pairs5['type'] = str('normal')
pairs5a['type'] = str('normal')
pairs5b['type'] = str('normal')
pairs5c['type'] = str('normal')
pairs6['type'] = str('normal')
pairs6a['type'] = str('normal')
pairs6b['type'] = str('normal')
pairs6c['type'] = str('normal')
pairs6d['type'] = str('normal')
pairs6e['type'] = str('normal')


frames = [pairs1, pairs1a, pairs2, pairs2a, pairs2b, pairs3, pairs3a, pairs3b, pairs3c, 
          pairs4, pairs4a, pairs4b, pairs4c, pairs5, pairs5a, pairs5b, pairs5c, pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e]

In [120]:
total_pairs = pd.concat(frames)

In [121]:
total_pairs.shape


Out[121]:
(4035, 15)

In [122]:
y = total_pairs.methylation_difference # dependent variable
X = total_pairs.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

In [123]:
categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)
    
X = X.drop(['type_normal'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation")
est.summary()


Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation
Out[123]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.043
Model: OLS Adj. R-squared: 0.042
Method: Least Squares F-statistic: 44.80
Date: Tue, 09 Aug 2016 Prob (F-statistic): 7.35e-37
Time: 18:37:24 Log-Likelihood: 9099.1
No. Observations: 4035 AIC: -1.819e+04
Df Residuals: 4030 BIC: -1.816e+04
Df Model: 4
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 0.1532 0.031 4.988 0.000 0.093 0.213
total_cpg_no_filter -1.171e-08 2.1e-09 -5.566 0.000 -1.58e-08 -7.59e-09
bsRate_mean -0.1167 0.029 -4.061 0.000 -0.173 -0.060
avgReadCpG_mean 0.0002 0.001 0.341 0.733 -0.001 0.002
type_CLL -0.0106 0.001 -9.594 0.000 -0.013 -0.008
Omnibus: 578.877 Durbin-Watson: 1.457
Prob(Omnibus): 0.000 Jarque-Bera (JB): 883.294
Skew: 1.021 Prob(JB): 1.57e-192
Kurtosis: 4.043 Cond. No. 5.10e+07

In [124]:
X.head()


Out[124]:
const total_cpg_no_filter bsRate_mean avgReadCpG_mean type_CLL
0 1 709541.5 0.980181 7.029760 1.0
1 1 786284.0 0.980293 7.017405 1.0
2 1 683623.5 0.980268 7.039100 1.0
3 1 853200.5 0.980262 7.028117 1.0
4 1 842686.5 0.980228 7.025451 1.0

In [125]:
print("methylation_difference vs. bsRate, jointplot, both CLL and normal ")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, both CLL and normal ")


methylation_difference vs. bsRate, jointplot, both CLL and normal 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[125]:
<matplotlib.text.Text at 0x11a756358>

In [126]:
print("methylation_difference vs. mean avgReadCpG per cell, jointplot, both CLL and normal ")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. avgReadCpG_mean, jointplot, both CLL and normal ")


methylation_difference vs. mean avgReadCpG per cell, jointplot, both CLL and normal 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[126]:
<matplotlib.text.Text at 0x11aab3198>

In [127]:
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. total # Unique_CpGs per cell, jointplot, both CLL and normal ")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[127]:
<matplotlib.text.Text at 0x11add4898>

In [128]:
print("methylation_difference vs. bsRate, by type! ")
sns.lmplot(x="bsRate_mean", y="methylation_difference",  data=total_pairs, hue='type')
plt.title("methylation_difference vs. bsRate, by type, CLL vs normal")


methylation_difference vs. bsRate, by type! 
Out[128]:
<matplotlib.text.Text at 0x11b0e6e48>

In [129]:
print("methylation_difference vs. avgReadCpG per cell, by type! ")
sns.lmplot(x="avgReadCpG_mean", y="methylation_difference",  data=total_pairs, hue='type')
plt.title("methylation_difference vs. avgReadCpG per cell, by type, CLL vs Normal ")


methylation_difference vs. avgReadCpG per cell, by type! 
Out[129]:
<matplotlib.text.Text at 0x11b21dba8>

In [130]:
print("methylation_difference vs. total unique CpG per cel, by type!")
sns.lmplot(x="total_cpg_no_filter", y="methylation_difference",  data=total_pairs, hue='type')
plt.title("methylation_difference vs. total unique CpG per cell, by type, CLL vs Normal ")


methylation_difference vs. total unique CpG per cel, by type!
Out[130]:
<matplotlib.text.Text at 0x11b35deb8>

In [131]:
#
# Let's see feature ranking
#
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting methylation_difference: cell type, avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:11: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
Random Forest model score is 0.895730689281

In [132]:
#
# Now do analysis by cell type, CLL vs normal, entire
#
cll_frames = [pairs1, pairs1a, pairs2, pairs2a, pairs2b]

normal_frames = [pairs3, pairs3a, pairs3b, pairs3c, pairs4, pairs4a, pairs4b, pairs4c, 
                 pairs5, pairs5a, pairs5b, pairs5c, pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e]

cll_pairs = pd.concat(cll_frames)
print(cll_pairs.shape)

normal_pairs = pd.concat(normal_frames)
print(normal_pairs.shape)


(990, 15)
(3045, 15)

In [133]:
#
# CLL first
#
y = cll_pairs.methylation_difference # dependent variable
X = cll_pairs.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG", "type"], axis=1)

X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation")
print(X.shape)
est.summary()


Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation
(990, 4)
Out[133]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.201
Model: OLS Adj. R-squared: 0.198
Method: Least Squares F-statistic: 82.59
Date: Tue, 09 Aug 2016 Prob (F-statistic): 1.14e-47
Time: 18:40:03 Log-Likelihood: 2540.5
No. Observations: 990 AIC: -5073.
Df Residuals: 986 BIC: -5053.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -0.1089 0.125 -0.871 0.384 -0.354 0.137
total_cpg_no_filter -4.165e-08 5.02e-09 -8.292 0.000 -5.15e-08 -3.18e-08
bsRate_mean -0.1666 0.114 -1.465 0.143 -0.390 0.057
avgReadCpG_mean 0.0450 0.008 5.576 0.000 0.029 0.061
Omnibus: 76.465 Durbin-Watson: 1.679
Prob(Omnibus): 0.000 Jarque-Bera (JB): 98.596
Skew: 0.659 Prob(JB): 3.89e-22
Kurtosis: 3.810 Cond. No. 1.66e+08

In [134]:
print("methylation_difference vs. bsRate, jointplot, CLL pairs ")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  CLL pairs ")


methylation_difference vs. bsRate, jointplot, CLL pairs 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[134]:
<matplotlib.text.Text at 0x161c30278>

In [135]:
print("methylation_difference vs. mean avgReadCpG per cell, jointplot,  CLL pairs ")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. avgReadCpG_mean, jointplot, CLL pairs ")


methylation_difference vs. mean avgReadCpG per cell, jointplot,  CLL pairs 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[135]:
<matplotlib.text.Text at 0x22bc59c18>

In [136]:
print("methylation diference vs. total # unique CpGs per cell, CLL pairs")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. total # Unique_CpGs per cell,  CLL pairs")


methylation diference vs. total # unique CpGs per cell, CLL pairs
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[136]:
<matplotlib.text.Text at 0x22dbf50b8>

In [137]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting methylation_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
Random Forest model score is 0.911536366763

In [ ]:


In [138]:
#
# normal
#
y = normal_pairs.methylation_difference # dependent variable
X = normal_pairs.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG", "type"], axis=1)

X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation")
print(X.shape)
est.summary()


Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation
(3045, 4)
Out[138]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.003
Model: OLS Adj. R-squared: 0.002
Method: Least Squares F-statistic: 2.827
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.0372
Time: 18:40:45 Log-Likelihood: 6693.1
No. Observations: 3045 AIC: -1.338e+04
Df Residuals: 3041 BIC: -1.335e+04
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 0.0540 0.035 1.530 0.126 -0.015 0.123
total_cpg_no_filter -7.466e-10 2.61e-09 -0.286 0.775 -5.87e-09 4.37e-09
bsRate_mean -0.0290 0.033 -0.885 0.376 -0.093 0.035
avgReadCpG_mean 0.0015 0.001 1.979 0.048 1.42e-05 0.003
Omnibus: 404.716 Durbin-Watson: 1.467
Prob(Omnibus): 0.000 Jarque-Bera (JB): 585.076
Skew: 0.997 Prob(JB): 8.96e-128
Kurtosis: 3.798 Cond. No. 4.39e+07

In [139]:
print("methylation_difference vs. bsRate, jointplot, Normal pairs ")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal pairs ")


methylation_difference vs. bsRate, jointplot, Normal pairs 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[139]:
<matplotlib.text.Text at 0x11bb8e048>

In [140]:
print("methylation_difference vs. mean avgReadCpG per cell, jointplot, Normal pairs ")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. avgReadCpG_mean, jointplot, Normal pairs ")


methylation_difference vs. mean avgReadCpG per cell, jointplot, Normal pairs 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[140]:
<matplotlib.text.Text at 0x11c0f2c50>

In [141]:
print("methylation diference vs. total # unique CpGs per cell, Normal pairs")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. total # Unique_CpGs per cell, Normal pairs")


methylation diference vs. total # unique CpGs per cell, Normal pairs
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[141]:
<matplotlib.text.Text at 0x11c41b080>

In [142]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting methylation_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
Random Forest model score is 0.888721635344

In [145]:
X


Out[145]:
const total_cpg_no_filter bsRate_mean avgReadCpG_mean
0 1 416763.5 0.997315 6.891204
1 1 309204.0 0.997480 6.972952
2 1 339687.5 0.997430 6.847710
3 1 408786.5 0.997382 6.851644
4 1 384508.5 0.997391 6.858078
5 1 396657.0 0.997365 6.858142
6 1 338167.0 0.997429 6.851672
7 1 344211.5 0.997454 6.881279
8 1 425468.5 0.997407 6.827551
9 1 342130.5 0.997368 6.860331
10 1 321270.0 0.997328 6.905727
11 1 282875.5 0.997340 6.879531
12 1 294697.0 0.997364 6.978425
13 1 360588.5 0.997415 6.895013
14 1 380398.0 0.997408 6.905550
15 1 301601.5 0.997372 6.937971
16 1 322817.5 0.997380 7.015534
17 1 353301.0 0.997330 6.890292
18 1 422400.0 0.997282 6.894226
19 1 398122.0 0.997291 6.900660
20 1 410270.5 0.997265 6.900724
21 1 351780.5 0.997329 6.894254
22 1 357825.0 0.997354 6.923861
23 1 439082.0 0.997307 6.870133
24 1 355744.0 0.997268 6.902913
25 1 334883.5 0.997228 6.948309
26 1 296489.0 0.997240 6.922113
27 1 308310.5 0.997264 7.021007
28 1 374202.0 0.997315 6.937595
29 1 394011.5 0.997308 6.948132
... ... ... ... ...
75 1 535320.0 0.961082 6.990924
76 1 497376.5 0.960889 6.977585
77 1 507416.5 0.961103 6.972817
78 1 413178.5 0.961120 6.937515
79 1 296916.5 0.960838 7.015553
80 1 382519.0 0.961078 6.983289
81 1 390752.0 0.961159 6.967151
82 1 423436.0 0.961255 6.984978
83 1 385492.5 0.961062 6.971639
84 1 544249.0 0.961015 6.936375
85 1 427987.0 0.960732 7.014413
86 1 513589.5 0.960973 6.982149
87 1 521822.5 0.961054 6.966011
88 1 554506.5 0.961150 6.983837
89 1 516563.0 0.960956 6.970499
90 1 333749.0 0.960749 6.979111
91 1 419351.5 0.960990 6.946847
92 1 427584.5 0.961071 6.930709
93 1 460268.5 0.961167 6.948535
94 1 422325.0 0.960974 6.935197
95 1 303089.5 0.960707 7.024885
96 1 311322.5 0.960788 7.008748
97 1 344006.5 0.960884 7.026574
98 1 306063.0 0.960691 7.013236
99 1 396925.0 0.961029 6.976484
100 1 429609.0 0.961125 6.994310
101 1 391665.5 0.960932 6.980971
102 1 437842.0 0.961206 6.978172
103 1 399898.5 0.961013 6.964834
104 1 432582.5 0.961109 6.982660

3045 rows × 4 columns


In [ ]:


In [ ]: