In [1]:
%matplotlib inline

In [2]:
#
# Covariates are: 
#   - Number of unique CpGs per cell
#   - Median Average Read CpG per cell (or mean if normally distributed)
#   - BS rate per cell
#   - CLL or Normal status per cell
#
# For distances (i.e. PDR difference between pairs, methylation difference between pairs), 
# the covariates are the mean between the two pairs. 
#

In [3]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # print all rows


import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')

import statsmodels.api as sm

In [4]:
stats = pd.read_csv("all_RRBS_statistics_final.csv")

In [5]:
stats.shape


Out[5]:
(438, 15)

In [6]:
normal = stats[stats["type"]=="normal"]
CLL = stats[stats["type"]=="CLL"]

In [7]:
len(normal)


Out[7]:
336

In [8]:
len(CLL)


Out[8]:
102

In [9]:
mcell_cpg = pd.read_csv("mcell_avgCpGs.csv")
pcell_cpg = pd.read_csv("pcell_avgCpGs.csv")
CD19cell_cpg = pd.read_csv("CD19_avgCpGs.csv")
normalB_cell_cpg = pd.read_csv("normalB_avgCpGs.csv")
trito_cell_cpg = pd.read_csv("trito_avgCpGs.csv")
cw154_cell_cpg = pd.read_csv("cw154_cpgs.csv")

In [10]:
mcell_cpg = mcell_cpg.drop(["Unnamed: 0"], axis=1)
pcell_cpg = pcell_cpg.drop(["Unnamed: 0"], axis=1)
CD19cell_cpg = CD19cell_cpg.drop(["Unnamed: 0"], axis=1)
normalB_cell_cpg = normalB_cell_cpg.drop(["Unnamed: 0"], axis=1)
trito_cell_cpg = trito_cell_cpg.drop(["Unnamed: 0"], axis=1)
cw154_cell_cpg = cw154_cell_cpg.drop(["Unnamed: 0"], axis=1)

In [11]:
cpg_total = pd.concat([mcell_cpg, pcell_cpg, CD19cell_cpg, normalB_cell_cpg, trito_cell_cpg, cw154_cell_cpg])

In [12]:
cpg_total.shape


Out[12]:
(513, 4)

In [13]:
merged = stats.merge(cpg_total, on="filename")

In [14]:
merged.shape


Out[14]:
(438, 18)

In [15]:
merged.head()


Out[15]:
filename methylation PDR_total methylation_unweighted PDR_unweighted thisMeth mixedReadCount total_reads type bio protocol total_cpg_no_filter total_cpg_gtrthan1 total_cpg_gtrthan38 bsRate avgReadCpgs_nofilter avgReadCpgs_lessthan1CpG avgReadCpgs_gtreql3.8CpG
0 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CATGAC 0.529505 0.235795 0.632802 0.231878 2208325.0 983394.0 4170549.0 normal CD19CD27m NormalBCD19pCD27mcell67_88 525282.0 525251.0 435636.0 0.9975 5.354284 5.355660 7.019255
1 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CCTTCG 0.455550 0.177631 0.583859 0.175371 733064.0 285841.0 1609185.0 normal CD19CD27m NormalBCD19pCD27mcell67_88 221972.0 221962.0 186757.0 0.9975 5.587294 5.588449 7.302612
2 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CGGTAG 0.515269 0.177645 0.618578 0.174221 1452802.0 500870.0 2819500.0 normal CD19CD27m NormalBCD19pCD27mcell67_88 355730.0 355713.0 295624.0 0.9975 5.393199 5.394331 7.079288
3 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTATTG 0.556175 0.176367 0.652727 0.172273 2279354.0 722800.0 4098270.0 normal CD19CD27m NormalBCD19pCD27mcell67_88 483179.0 483150.0 397812.0 0.9975 5.287116 5.288477 6.979525
4 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTCAGC 0.528642 0.181331 0.640401 0.172287 1394208.0 478231.0 2637340.0 normal CD19CD27m NormalBCD19pCD27mcell67_88 356122.0 356100.0 294065.0 0.9975 5.314302 5.315719 6.995052

In [16]:
sns.lmplot(x="bsRate", y="methylation",  data=merged)
plt.title("methylation (weighted) vs bisulfite conversion rate")


Out[16]:
<matplotlib.text.Text at 0x110e9f320>

In [17]:
sns.lmplot(x="bsRate", y="methylation_unweighted",  data=merged)
plt.title("methylation (unweighted) vs bisulfite conversion rate")


Out[17]:
<matplotlib.text.Text at 0x110f68198>

In [18]:
sns.lmplot(x="bsRate", y="PDR_total",  data=merged)
plt.title("PDR (weighted) vs bisulfite conversion rate")


Out[18]:
<matplotlib.text.Text at 0x111a06e80>

In [19]:
sns.lmplot(x="bsRate", y="PDR_unweighted",  data=merged)
plt.title("PDR (unweighted) vs bisulfite conversion rate")


Out[19]:
<matplotlib.text.Text at 0x111b10b00>

In [20]:
sns.lmplot(x="bsRate", y="total_cpg_no_filter",  data=merged)
plt.title("Total # unique CpGs per cell (no filter) vs bisulfite conversion rate")


Out[20]:
<matplotlib.text.Text at 0x111c19908>

In [21]:
sns.lmplot(x="bsRate", y="total_cpg_gtrthan1",  data=merged)
plt.title("Total # unique CpGs per cell (filter: > 1) vs bisulfite conversion rate")


Out[21]:
<matplotlib.text.Text at 0x111d1fc88>

In [22]:
sns.lmplot(x="bsRate", y="total_cpg_gtrthan38",  data=merged)
plt.title("Total # unique CpGs per cell (filter: >= 3.8) vs bisulfite conversion rate")


Out[22]:
<matplotlib.text.Text at 0x111e257f0>

In [23]:
sns.lmplot(x="bsRate", y="avgReadCpgs_nofilter",  data=merged)
plt.title("Mean avgCpG read per cell (no filter) vs bisulfite conversion rate")


Out[23]:
<matplotlib.text.Text at 0x111fb4a20>

In [24]:
sns.lmplot(x="bsRate", y="avgReadCpgs_lessthan1CpG",  data=merged)
plt.title("Mean avgCpG read per cell (filter: > 1) vs bisulfite conversion rate")


Out[24]:
<matplotlib.text.Text at 0x1120b6940>

In [25]:
sns.lmplot(x="bsRate", y="avgReadCpgs_gtreql3.8CpG",  data=merged)
plt.title("Mean avgCpG read per cell (filter: >= 3.8) vs bisulfite conversion rate")


Out[25]:
<matplotlib.text.Text at 0x11223f2e8>

In [ ]:


In [ ]:


In [26]:
tritopool = merged[merged["protocol"] == 'trito_pool_1']       # select only "trito_pool_1" files
print(len(tritopool))
tritopool = tritopool.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
tritopoolA = tritopool.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool.filename,2)) # combines into all pairs
out = pd.DataFrame([tritopoolA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.methylation, tritopool.methylation)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1 = pd.merge(out, methylation_differences, how='inner')
print(pairs1.shape)
pairs1 = pd.merge(out, methylation_differences, how='inner')
print(pairs1.shape)
pairs1 = pairs1.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs1.methylation_difference # dependent variable to predict

X = pairs1.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


21
(210, 14)
(210, 14)
(210,)
(210, 3)
Regression results for CLL 'RRBS_trito_pool1', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[26]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.058
Model: OLS Adj. R-squared: 0.044
Method: Least Squares F-statistic: 4.225
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.00633
Time: 17:36:01 Log-Likelihood: 635.49
No. Observations: 210 AIC: -1263.
Df Residuals: 206 BIC: -1250.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -29.7869 11.327 -2.630 0.009 -52.119 -7.455
total_cpg_no_filter -4.194e-08 1.27e-08 -3.314 0.001 -6.69e-08 -1.7e-08
bsRate_mean 30.0727 11.557 2.602 0.010 7.288 52.858
avgReadCpG_mean 0.0665 0.030 2.179 0.030 0.006 0.127
Omnibus: 36.210 Durbin-Watson: 2.186
Prob(Omnibus): 0.000 Jarque-Bera (JB): 10.249
Skew: 0.219 Prob(JB): 0.00595
Kurtosis: 2.010 Cond. No. 1.46e+10

In [27]:
print("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs1, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")


methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[27]:
<matplotlib.text.Text at 0x112448a20>

In [28]:
print("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs1, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")


methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[28]:
<matplotlib.text.Text at 0x112749048>

In [29]:
print("methylation_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs1, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")


methylation_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_1
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[29]:
<matplotlib.text.Text at 0x112a68208>

In [30]:
tritopool = merged[merged["protocol"] == 'trito_pool_2']       # select only "trito_pool_1" files
print(len(tritopool))
tritopool = tritopool.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
tritopoolA = tritopool.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool.filename,2)) # combines into all pairs
out = pd.DataFrame([tritopoolA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.methylation, tritopool.methylation)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1a = pd.merge(out, methylation_differences, how='inner')
print(pairs1a.shape)
pairs1a = pairs1a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs1a.methylation_difference # dependent variable to predict

X = pairs1a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


21
(210, 14)
(210,)
(210, 3)
Regression results for CLL 'RRBS_trito_pool1', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[30]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.174
Model: OLS Adj. R-squared: 0.162
Method: Least Squares F-statistic: 14.50
Date: Tue, 09 Aug 2016 Prob (F-statistic): 1.32e-08
Time: 17:36:04 Log-Likelihood: 664.66
No. Observations: 210 AIC: -1321.
Df Residuals: 206 BIC: -1308.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -45.7405 13.998 -3.268 0.001 -73.339 -18.142
total_cpg_no_filter -5.292e-08 9.32e-09 -5.678 0.000 -7.13e-08 -3.45e-08
bsRate_mean 46.8692 14.491 3.234 0.001 18.300 75.439
avgReadCpG_mean 0.0110 0.032 0.342 0.733 -0.053 0.075
Omnibus: 25.435 Durbin-Watson: 2.256
Prob(Omnibus): 0.000 Jarque-Bera (JB): 8.013
Skew: 0.137 Prob(JB): 0.0182
Kurtosis: 2.083 Cond. No. 2.05e+10

In [31]:
print("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs1a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")


methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[31]:
<matplotlib.text.Text at 0x112ccc320>

In [32]:
print("methylation_difference vs. mean avgReadCpGs, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs1a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")


methylation_difference vs. mean avgReadCpGs, jointplot, RRBS_trito_pool CLL trito_2
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[32]:
<matplotlib.text.Text at 0x112f0cdd8>

In [33]:
print("methylation_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs1a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")


methylation_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_2
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[33]:
<matplotlib.text.Text at 0x11320f978>

In [ ]:


In [34]:
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.methylation, cw154.methylation)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2 = pd.merge(out, methylation_differences, how='inner')
print(pairs2.shape)
pairs2 = pairs2.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs2.methylation_difference # dependent variable to predict

X = pairs2.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_Tris_protease', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_Tris_protease', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[34]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.314
Model: OLS Adj. R-squared: 0.303
Method: Least Squares F-statistic: 28.37
Date: Tue, 09 Aug 2016 Prob (F-statistic): 3.75e-15
Time: 17:36:07 Log-Likelihood: 445.44
No. Observations: 190 AIC: -882.9
Df Residuals: 186 BIC: -869.9
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 37.2361 10.555 3.528 0.001 16.413 58.059
total_cpg_no_filter -8.362e-08 1.75e-08 -4.789 0.000 -1.18e-07 -4.92e-08
bsRate_mean -39.7882 11.037 -3.605 0.000 -61.561 -18.015
avgReadCpG_mean 0.1967 0.027 7.193 0.000 0.143 0.251
Omnibus: 8.412 Durbin-Watson: 1.487
Prob(Omnibus): 0.015 Jarque-Bera (JB): 12.206
Skew: 0.252 Prob(JB): 0.00224
Kurtosis: 4.135 Cond. No. 3.35e+09

In [35]:
print("methylation_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs2, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, cw154  CLL cw154_Tris_protease")


methylation_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[35]:
<matplotlib.text.Text at 0x11356e518>

In [36]:
print("methylation_difference vs. avgReadCpGs, jointplot, CLL cw154_Tris_protease")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs2, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_Tris_protease")


methylation_difference vs. avgReadCpGs, jointplot, CLL cw154_Tris_protease
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[36]:
<matplotlib.text.Text at 0x113749748>

In [37]:
print("methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs2, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_Tris_protease")


methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[37]:
<matplotlib.text.Text at 0x113990ac8>

In [38]:
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease_GR']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.methylation, cw154.methylation)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2a = pd.merge(out, methylation_differences, how='inner')
print(pairs2a.shape)
pairs2a = pairs2a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs2a.methylation_difference # dependent variable to predict

X = pairs2a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_Tris_protease_GR', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_Tris_protease_GR', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[38]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.172
Model: OLS Adj. R-squared: 0.158
Method: Least Squares F-statistic: 12.84
Date: Tue, 09 Aug 2016 Prob (F-statistic): 1.15e-07
Time: 17:36:10 Log-Likelihood: 460.45
No. Observations: 190 AIC: -912.9
Df Residuals: 186 BIC: -899.9
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -37.9421 11.108 -3.416 0.001 -59.855 -16.029
total_cpg_no_filter -6.168e-08 1.39e-08 -4.451 0.000 -8.9e-08 -3.43e-08
bsRate_mean 39.4395 11.631 3.391 0.001 16.494 62.385
avgReadCpG_mean 0.0190 0.021 0.918 0.360 -0.022 0.060
Omnibus: 2.785 Durbin-Watson: 2.411
Prob(Omnibus): 0.248 Jarque-Bera (JB): 2.427
Skew: -0.177 Prob(JB): 0.297
Kurtosis: 2.574 Cond. No. 4.71e+09

In [39]:
print("methylation_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease_GR")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs2a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, cw154  CLL cw154_Tris_protease_GR")


methylation_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease_GR
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[39]:
<matplotlib.text.Text at 0x113c274a8>

In [40]:
print("methylation_difference vs. avgReadCpgs, jointplot, CLL cw154_Tris_protease_GR")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs2a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_Tris_protease_GR")


methylation_difference vs. avgReadCpgs, jointplot, CLL cw154_Tris_protease_GR
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[40]:
<matplotlib.text.Text at 0x1139c76a0>

In [41]:
print("methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease_GR")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs2a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_Tris_protease_GR")


methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease_GR
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[41]:
<matplotlib.text.Text at 0x114090978>

In [ ]:


In [ ]:


In [42]:
cw154 = merged[merged["protocol"] == 'cw154_CutSmart_proteinase_K']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.methylation, cw154.methylation)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2b = pd.merge(out, methylation_differences, how='inner')
print(pairs2b.shape)
pairs2b = pairs2b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs2b.methylation_difference # dependent variable to predict

X = pairs2b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[42]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.108
Model: OLS Adj. R-squared: 0.093
Method: Least Squares F-statistic: 7.468
Date: Tue, 09 Aug 2016 Prob (F-statistic): 9.52e-05
Time: 17:36:13 Log-Likelihood: 516.00
No. Observations: 190 AIC: -1024.
Df Residuals: 186 BIC: -1011.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -36.1770 9.934 -3.642 0.000 -55.774 -16.580
total_cpg_no_filter -5.23e-08 1.25e-08 -4.189 0.000 -7.69e-08 -2.77e-08
bsRate_mean 37.2190 10.318 3.607 0.000 16.863 57.575
avgReadCpG_mean 0.0647 0.034 1.923 0.056 -0.002 0.131
Omnibus: 7.871 Durbin-Watson: 1.582
Prob(Omnibus): 0.020 Jarque-Bera (JB): 6.809
Skew: 0.384 Prob(JB): 0.0332
Kurtosis: 2.481 Cond. No. 6.97e+09

In [43]:
print("methylation_difference vs. bsRate, jointplot,  CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs2b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")


methylation_difference vs. bsRate, jointplot,  CLL cw154_CutSmart_proteinase_K
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[43]:
<matplotlib.text.Text at 0x114429cc0>

In [44]:
print("methylation_difference vs. avgReadCpG_mean, jointplot, CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs2b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_CutSmart_proteinase_K")


methylation_difference vs. avgReadCpG_mean, jointplot, CLL cw154_CutSmart_proteinase_K
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[44]:
<matplotlib.text.Text at 0x11465c780>

In [45]:
print("methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs2b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")


methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_CutSmart_proteinase_K
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[45]:
<matplotlib.text.Text at 0x1148abf28>

In [ ]:


In [46]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell1_22_']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation, pcell.methylation)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3 = pd.merge(out, methylation_differences, how='inner')
print(pairs3.shape)
pairs3 = pairs3.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs3.methylation_difference # dependent variable to predict

X = pairs3.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


17
(136, 14)
(136,)
(136, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[46]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.181
Model: OLS Adj. R-squared: 0.162
Method: Least Squares F-statistic: 9.691
Date: Tue, 09 Aug 2016 Prob (F-statistic): 7.96e-06
Time: 17:36:16 Log-Likelihood: 329.85
No. Observations: 136 AIC: -651.7
Df Residuals: 132 BIC: -640.0
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -110.7678 32.376 -3.421 0.001 -174.811 -46.725
total_cpg_no_filter 1.626e-07 4.08e-08 3.983 0.000 8.18e-08 2.43e-07
bsRate_mean 109.8557 32.405 3.390 0.001 45.755 173.956
avgReadCpG_mean 0.2245 0.048 4.638 0.000 0.129 0.320
Omnibus: 4.465 Durbin-Watson: 1.138
Prob(Omnibus): 0.107 Jarque-Bera (JB): 3.844
Skew: 0.321 Prob(JB): 0.146
Kurtosis: 2.485 Cond. No. 7.73e+09

In [47]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs3, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[47]:
<matplotlib.text.Text at 0x114aa6d30>

In [48]:
print("methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs3, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell1_22_")


methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[48]:
<matplotlib.text.Text at 0x114d315f8>

In [49]:
print("methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs3, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")


methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[49]:
<matplotlib.text.Text at 0x1151109e8>

In [ ]:


In [50]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell23_44']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation, pcell.methylation)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3a = pd.merge(out, methylation_differences, how='inner')
print(pairs3a.shape)
pairs3a = pairs3a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs3a.methylation_difference # dependent variable to predict

X = pairs3a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


22
(231, 14)
(231,)
(231, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[50]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.203
Model: OLS Adj. R-squared: 0.192
Method: Least Squares F-statistic: 19.25
Date: Tue, 09 Aug 2016 Prob (F-statistic): 3.70e-11
Time: 17:36:19 Log-Likelihood: 525.25
No. Observations: 231 AIC: -1043.
Df Residuals: 227 BIC: -1029.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -69.6948 41.613 -1.675 0.095 -151.692 12.302
total_cpg_no_filter 9.608e-08 3.27e-08 2.938 0.004 3.16e-08 1.61e-07
bsRate_mean 69.0365 41.693 1.656 0.099 -13.118 151.191
avgReadCpG_mean 0.1567 0.022 7.143 0.000 0.113 0.200
Omnibus: 7.867 Durbin-Watson: 1.480
Prob(Omnibus): 0.020 Jarque-Bera (JB): 8.184
Skew: 0.457 Prob(JB): 0.0167
Kurtosis: 2.882 Cond. No. 1.03e+10

In [51]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs3a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[51]:
<matplotlib.text.Text at 0x115478d30>

In [52]:
print("methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs3a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell23_44")


methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[52]:
<matplotlib.text.Text at 0x1156c81d0>

In [53]:
print("methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs3a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")


methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[53]:
<matplotlib.text.Text at 0x1158d1cc0>

In [ ]:


In [54]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell45_66']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation, pcell.methylation)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3b = pd.merge(out, methylation_differences, how='inner')
print(pairs3b.shape)
pairs3b = pairs3b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs3b.methylation_difference # dependent variable to predict

X = pairs3b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


9
(36, 14)
(36,)
(36, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[54]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.427
Model: OLS Adj. R-squared: 0.373
Method: Least Squares F-statistic: 7.955
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.000421
Time: 17:36:22 Log-Likelihood: 85.238
No. Observations: 36 AIC: -162.5
Df Residuals: 32 BIC: -156.1
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 351.9325 196.863 1.788 0.083 -49.064 752.929
total_cpg_no_filter -1.525e-08 2.86e-08 -0.534 0.597 -7.35e-08 4.3e-08
bsRate_mean -353.2137 197.033 -1.793 0.082 -754.558 48.130
avgReadCpG_mean 0.0846 0.072 1.176 0.248 -0.062 0.231
Omnibus: 5.471 Durbin-Watson: 1.941
Prob(Omnibus): 0.065 Jarque-Bera (JB): 3.978
Skew: -0.713 Prob(JB): 0.137
Kurtosis: 3.786 Cond. No. 3.28e+10

In [55]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs3b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[55]:
<matplotlib.text.Text at 0x115bfcbe0>

In [56]:
print("methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs3b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell45_66")


methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[56]:
<matplotlib.text.Text at 0x115ef03c8>

In [57]:
print("methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs3b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")


methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[57]:
<matplotlib.text.Text at 0x116200ac8>

In [ ]:


In [ ]:


In [58]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell67_88']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation, pcell.methylation)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3c = pd.merge(out, methylation_differences, how='inner')
print(pairs3c.shape)
pairs3c = pairs3c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs3c.methylation_difference # dependent variable to predict

X = pairs3c.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[58]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.050
Model: OLS Adj. R-squared: 0.035
Method: Least Squares F-statistic: 3.275
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.0223
Time: 17:36:25 Log-Likelihood: 525.18
No. Observations: 190 AIC: -1042.
Df Residuals: 186 BIC: -1029.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 69.6294 24.295 2.866 0.005 21.700 117.559
total_cpg_no_filter 2.999e-08 1.92e-08 1.562 0.120 -7.88e-09 6.79e-08
bsRate_mean -69.6733 24.291 -2.868 0.005 -117.595 -21.752
avgReadCpG_mean -0.0264 0.031 -0.862 0.390 -0.087 0.034
Omnibus: 13.454 Durbin-Watson: 1.501
Prob(Omnibus): 0.001 Jarque-Bera (JB): 14.785
Skew: 0.683 Prob(JB): 0.000616
Kurtosis: 3.022 Cond. No. 8.67e+09

In [59]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs3c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[59]:
<matplotlib.text.Text at 0x116476fd0>

In [60]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs3c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell67_88")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27pcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[60]:
<matplotlib.text.Text at 0x11678a320>

In [61]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs3c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27pcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[61]:
<matplotlib.text.Text at 0x116a07dd8>

In [ ]:


In [ ]:


In [62]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell1_22_']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation, mcell.methylation)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4 = pd.merge(out, methylation_differences, how='inner')
print(pairs4.shape)
pairs4 = pairs4.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs4.methylation_difference # dependent variable to predict

X = pairs4.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


18
(153, 14)
(153,)
(153, 3)
Regression results for Normal 'NormalBCD19pCD27mcell1_22_', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[62]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.449
Model: OLS Adj. R-squared: 0.438
Method: Least Squares F-statistic: 40.44
Date: Tue, 09 Aug 2016 Prob (F-statistic): 3.54e-19
Time: 17:36:28 Log-Likelihood: 355.10
No. Observations: 153 AIC: -702.2
Df Residuals: 149 BIC: -690.1
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -62.9953 60.343 -1.044 0.298 -182.235 56.244
total_cpg_no_filter 2.392e-07 3.79e-08 6.306 0.000 1.64e-07 3.14e-07
bsRate_mean 61.1395 60.603 1.009 0.315 -58.612 180.891
avgReadCpG_mean 0.3676 0.041 9.032 0.000 0.287 0.448
Omnibus: 3.265 Durbin-Watson: 1.350
Prob(Omnibus): 0.195 Jarque-Bera (JB): 3.271
Skew: 0.320 Prob(JB): 0.195
Kurtosis: 2.680 Cond. No. 1.42e+10

In [63]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs4, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[63]:
<matplotlib.text.Text at 0x116c0f5f8>

In [64]:
print("methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs4, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell1_22_")


methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[64]:
<matplotlib.text.Text at 0x116f55cc0>

In [65]:
print("methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs4, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")


methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[65]:
<matplotlib.text.Text at 0x117296198>

In [ ]:


In [66]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell23_44']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation, mcell.methylation)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4a = pd.merge(out, methylation_differences, how='inner')
print(pairs4a.shape)
pairs4a = pairs4a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs4a.methylation_difference # dependent variable to predict

X = pairs4a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell23_44', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


18
(153, 14)
(153,)
(153, 3)
Regression results for Normal 'NormalBCD19pCD27mcell23_44', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[66]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.181
Model: OLS Adj. R-squared: 0.165
Method: Least Squares F-statistic: 10.98
Date: Tue, 09 Aug 2016 Prob (F-statistic): 1.48e-06
Time: 17:36:31 Log-Likelihood: 407.34
No. Observations: 153 AIC: -806.7
Df Residuals: 149 BIC: -794.6
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -128.4435 51.809 -2.479 0.014 -230.819 -26.068
total_cpg_no_filter 1.846e-07 3.74e-08 4.940 0.000 1.11e-07 2.58e-07
bsRate_mean 128.1483 51.942 2.467 0.015 25.510 230.787
avgReadCpG_mean 0.1060 0.041 2.559 0.011 0.024 0.188
Omnibus: 6.165 Durbin-Watson: 1.471
Prob(Omnibus): 0.046 Jarque-Bera (JB): 6.269
Skew: 0.494 Prob(JB): 0.0435
Kurtosis: 2.913 Cond. No. 1.55e+10

In [67]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs4a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[67]:
<matplotlib.text.Text at 0x1174d9f60>

In [68]:
print("methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs4a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell23_44")


methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[68]:
<matplotlib.text.Text at 0x1177543c8>

In [69]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs4a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[69]:
<matplotlib.text.Text at 0x1179a5c50>

In [ ]:


In [70]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell45_66']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation, mcell.methylation)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4b = pd.merge(out, methylation_differences, how='inner')
print(pairs4b.shape)
pairs4b = pairs4b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs4b.methylation_difference # dependent variable to predict

X = pairs4b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell45_66', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


17
(136, 14)
(136,)
(136, 3)
Regression results for Normal 'NormalBCD19pCD27mcell45_66', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[70]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.325
Model: OLS Adj. R-squared: 0.309
Method: Least Squares F-statistic: 21.17
Date: Tue, 09 Aug 2016 Prob (F-statistic): 2.95e-11
Time: 17:36:33 Log-Likelihood: 308.35
No. Observations: 136 AIC: -608.7
Df Residuals: 132 BIC: -597.1
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 13.8277 66.836 0.207 0.836 -118.380 146.036
total_cpg_no_filter 2.966e-07 5.5e-08 5.389 0.000 1.88e-07 4.05e-07
bsRate_mean -15.6485 66.962 -0.234 0.816 -148.106 116.809
avgReadCpG_mean 0.3265 0.042 7.748 0.000 0.243 0.410
Omnibus: 4.240 Durbin-Watson: 1.349
Prob(Omnibus): 0.120 Jarque-Bera (JB): 4.280
Skew: 0.425 Prob(JB): 0.118
Kurtosis: 2.816 Cond. No. 1.27e+10

In [71]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs4b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[71]:
<matplotlib.text.Text at 0x117db2710>

In [72]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs4b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell45_66")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[72]:
<matplotlib.text.Text at 0x116fd2208>

In [73]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs4b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[73]:
<matplotlib.text.Text at 0x1182c1e10>

In [ ]:


In [74]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell67_88']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation, mcell.methylation)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4c = pd.merge(out, methylation_differences, how='inner')
print(pairs4c.shape)
pairs4c = pairs4c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs4c.methylation_difference # dependent variable to predict

X = pairs4c.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell67_88', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


21
(210, 14)
(210,)
(210, 3)
Regression results for Normal 'NormalBCD19pCD27mcell67_88', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[74]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.149
Model: OLS Adj. R-squared: 0.136
Method: Least Squares F-statistic: 12.00
Date: Tue, 09 Aug 2016 Prob (F-statistic): 2.85e-07
Time: 17:36:37 Log-Likelihood: 460.90
No. Observations: 210 AIC: -913.8
Df Residuals: 206 BIC: -900.4
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -209.1481 116.954 -1.788 0.075 -439.729 21.432
total_cpg_no_filter 1.36e-07 4.33e-08 3.139 0.002 5.06e-08 2.21e-07
bsRate_mean 208.5425 117.169 1.780 0.077 -22.461 439.546
avgReadCpG_mean 0.2065 0.037 5.553 0.000 0.133 0.280
Omnibus: 4.780 Durbin-Watson: 2.107
Prob(Omnibus): 0.092 Jarque-Bera (JB): 4.776
Skew: 0.335 Prob(JB): 0.0918
Kurtosis: 2.689 Cond. No. 3.04e+10

In [75]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs4c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[75]:
<matplotlib.text.Text at 0x1186204e0>

In [76]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs4c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[76]:
<matplotlib.text.Text at 0x1188025f8>

In [77]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs4c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[77]:
<matplotlib.text.Text at 0x118a47048>

In [ ]:


In [78]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell1_22_']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.methylation, CD19cell.methylation)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5 = pd.merge(out, methylation_differences, how='inner')
print(pairs5.shape)
pairs5 = pairs5.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs5.methylation_difference # dependent variable to predict

X = pairs5.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


21
(210, 14)
(210,)
(210, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell1_22_', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[78]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.076
Model: OLS Adj. R-squared: 0.062
Method: Least Squares F-statistic: 5.634
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.000992
Time: 17:36:40 Log-Likelihood: 457.17
No. Observations: 210 AIC: -906.3
Df Residuals: 206 BIC: -893.0
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -8.4475 45.518 -0.186 0.853 -98.189 81.294
total_cpg_no_filter -3.266e-08 1.14e-08 -2.876 0.004 -5.5e-08 -1.03e-08
bsRate_mean 7.9526 45.639 0.174 0.862 -82.028 97.933
avgReadCpG_mean 0.1082 0.031 3.507 0.001 0.047 0.169
Omnibus: 20.959 Durbin-Watson: 1.419
Prob(Omnibus): 0.000 Jarque-Bera (JB): 24.175
Skew: 0.811 Prob(JB): 5.63e-06
Kurtosis: 3.366 Cond. No. 1.48e+10

In [79]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs5, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")


methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[79]:
<matplotlib.text.Text at 0x118dc9ac8>

In [80]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs5, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell1_22_")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[80]:
<matplotlib.text.Text at 0x118fe9198>

In [81]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs5, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[81]:
<matplotlib.text.Text at 0x11922b550>

In [ ]:


In [82]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell23_44']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.methylation, CD19cell.methylation)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5a = pd.merge(out, methylation_differences, how='inner')
print(pairs5a.shape)
pairs5a = pairs5a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs5a.methylation_difference # dependent variable to predict

X = pairs5a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell23_44', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell23_44', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[82]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.141
Model: OLS Adj. R-squared: 0.127
Method: Least Squares F-statistic: 10.16
Date: Tue, 09 Aug 2016 Prob (F-statistic): 3.15e-06
Time: 17:36:43 Log-Likelihood: 443.88
No. Observations: 190 AIC: -879.8
Df Residuals: 186 BIC: -866.8
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -49.1048 26.379 -1.862 0.064 -101.145 2.935
total_cpg_no_filter -3.538e-08 7.7e-09 -4.593 0.000 -5.06e-08 -2.02e-08
bsRate_mean 48.8742 26.485 1.845 0.067 -3.376 101.124
avgReadCpG_mean 0.0784 0.029 2.748 0.007 0.022 0.135
Omnibus: 15.560 Durbin-Watson: 1.021
Prob(Omnibus): 0.000 Jarque-Bera (JB): 16.882
Skew: 0.702 Prob(JB): 0.000216
Kurtosis: 3.401 Cond. No. 1.63e+10

In [83]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs5a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")


methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[83]:
<matplotlib.text.Text at 0x119675cf8>

In [84]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs5a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell23_44")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[84]:
<matplotlib.text.Text at 0x11974d7f0>

In [85]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs5a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[85]:
<matplotlib.text.Text at 0x1199a28d0>

In [ ]:


In [86]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell45_66']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.methylation, CD19cell.methylation)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5b = pd.merge(out, methylation_differences, how='inner')
print(pairs5b.shape)
pairs5b = pairs5b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs5b.methylation_difference # dependent variable to predict

X = pairs5b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell45_66', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


22
(231, 14)
(231,)
(231, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell45_66', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[86]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.136
Model: OLS Adj. R-squared: 0.125
Method: Least Squares F-statistic: 11.93
Date: Tue, 09 Aug 2016 Prob (F-statistic): 2.77e-07
Time: 17:36:46 Log-Likelihood: 562.92
No. Observations: 231 AIC: -1118.
Df Residuals: 227 BIC: -1104.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 177.7968 55.838 3.184 0.002 67.771 287.823
total_cpg_no_filter -6.264e-09 8.38e-09 -0.748 0.455 -2.28e-08 1.02e-08
bsRate_mean -177.3583 55.935 -3.171 0.002 -287.577 -67.139
avgReadCpG_mean -0.1358 0.032 -4.243 0.000 -0.199 -0.073
Omnibus: 12.674 Durbin-Watson: 1.477
Prob(Omnibus): 0.002 Jarque-Bera (JB): 13.533
Skew: 0.592 Prob(JB): 0.00115
Kurtosis: 3.068 Cond. No. 2.98e+10

In [87]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs5b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")


methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[87]:
<matplotlib.text.Text at 0x119e13390>

In [88]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs5b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell45_66")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[88]:
<matplotlib.text.Text at 0x11a026358>

In [89]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs5b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[89]:
<matplotlib.text.Text at 0x11a33fac8>

In [ ]:


In [90]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell67_88']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.methylation, CD19cell.methylation)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5c = pd.merge(out, methylation_differences, how='inner')
print(pairs5c.shape)
pairs5c = pairs5c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs5c.methylation_difference # dependent variable to predict

X = pairs5c.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell67_88', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell67_88', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[90]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.038
Model: OLS Adj. R-squared: 0.023
Method: Least Squares F-statistic: 2.465
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.0637
Time: 17:36:49 Log-Likelihood: 432.69
No. Observations: 190 AIC: -857.4
Df Residuals: 186 BIC: -844.4
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 71.5955 50.953 1.405 0.162 -28.924 172.115
total_cpg_no_filter -6.057e-09 9.78e-09 -0.619 0.536 -2.53e-08 1.32e-08
bsRate_mean -72.0396 51.078 -1.410 0.160 -172.805 28.726
avgReadCpG_mean 0.0638 0.025 2.541 0.012 0.014 0.113
Omnibus: 9.219 Durbin-Watson: 1.561
Prob(Omnibus): 0.010 Jarque-Bera (JB): 9.789
Skew: 0.551 Prob(JB): 0.00749
Kurtosis: 2.856 Cond. No. 1.62e+10

In [91]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs5c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")


methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[91]:
<matplotlib.text.Text at 0x11a69b160>

In [92]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs5c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[92]:
<matplotlib.text.Text at 0x11a8f7d30>

In [93]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs5c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[93]:
<matplotlib.text.Text at 0x11ace6d30>

In [ ]:


In [94]:
normb = merged[merged["protocol"] == 'normal_B_cell_A1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6 = pd.merge(out, methylation_differences, how='inner')
print(pairs6.shape)
pairs6 = pairs6.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6.methylation_difference # dependent variable to predict

X = pairs6.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_A1_24', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_A1_24', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[94]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.063
Model: OLS Adj. R-squared: 0.046
Method: Least Squares F-statistic: 3.757
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.0121
Time: 17:36:52 Log-Likelihood: 475.19
No. Observations: 171 AIC: -942.4
Df Residuals: 167 BIC: -929.8
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 6.6360 4.178 1.588 0.114 -1.613 14.885
total_cpg_no_filter 1.37e-07 4.47e-08 3.061 0.003 4.86e-08 2.25e-07
bsRate_mean -6.8327 4.359 -1.568 0.119 -15.438 1.772
avgReadCpG_mean -0.0172 0.028 -0.623 0.534 -0.072 0.037
Omnibus: 14.882 Durbin-Watson: 1.095
Prob(Omnibus): 0.001 Jarque-Bera (JB): 16.959
Skew: 0.771 Prob(JB): 0.000208
Kurtosis: 2.946 Cond. No. 9.64e+08

In [95]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24")


methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[95]:
<matplotlib.text.Text at 0x11b0320b8>

In [96]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_A1_24")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_A1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[96]:
<matplotlib.text.Text at 0x11b2882e8>

In [97]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_A1_24")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_A1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[97]:
<matplotlib.text.Text at 0x11a9f7e10>

In [ ]:


In [98]:
normb = merged[merged["protocol"] == 'normal_B_cell_B1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6a = pd.merge(out, methylation_differences, how='inner')
print(pairs6a.shape)
pairs6a = pairs6a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6a.methylation_difference # dependent variable to predict

X = pairs6a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_B1_24', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'normal_B_cell_B1_24', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[98]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.005
Model: OLS Adj. R-squared: -0.011
Method: Least Squares F-statistic: 0.3117
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.817
Time: 17:36:55 Log-Likelihood: 409.72
No. Observations: 190 AIC: -811.4
Df Residuals: 186 BIC: -798.5
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -4.8241 18.081 -0.267 0.790 -40.495 30.847
total_cpg_no_filter 1.745e-08 2.34e-08 0.745 0.457 -2.88e-08 6.37e-08
bsRate_mean 5.0309 18.951 0.265 0.791 -32.356 42.418
avgReadCpG_mean 0.0045 0.039 0.116 0.908 -0.072 0.081
Omnibus: 11.921 Durbin-Watson: 1.754
Prob(Omnibus): 0.003 Jarque-Bera (JB): 12.286
Skew: 0.586 Prob(JB): 0.00215
Kurtosis: 2.578 Cond. No. 4.74e+09

In [99]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24")


methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[99]:
<matplotlib.text.Text at 0x11b7b9d68>

In [100]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_B1_24")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_B1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[100]:
<matplotlib.text.Text at 0x11baa4ef0>

In [101]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_B1_24")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_B1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[101]:
<matplotlib.text.Text at 0x11bce4550>

In [ ]:


In [102]:
normb = merged[merged["protocol"] == 'normal_B_cell_C1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6b = pd.merge(out, methylation_differences, how='inner')
print(pairs6b.shape)
pairs6b = pairs6b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6b.methylation_difference # dependent variable to predict

X = pairs6b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_C1_24', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_C1_24', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[102]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.015
Model: OLS Adj. R-squared: -0.002
Method: Least Squares F-statistic: 0.8613
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.462
Time: 17:36:58 Log-Likelihood: 369.54
No. Observations: 171 AIC: -731.1
Df Residuals: 167 BIC: -718.5
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 0.6788 17.936 0.038 0.970 -34.732 36.090
total_cpg_no_filter -2.826e-08 1.91e-08 -1.477 0.142 -6.6e-08 9.52e-09
bsRate_mean -0.7185 18.767 -0.038 0.970 -37.770 36.333
avgReadCpG_mean 0.0129 0.055 0.234 0.815 -0.096 0.122
Omnibus: 12.205 Durbin-Watson: 1.343
Prob(Omnibus): 0.002 Jarque-Bera (JB): 13.443
Skew: 0.673 Prob(JB): 0.00120
Kurtosis: 2.726 Cond. No. 7.61e+09

In [103]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24")


methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[103]:
<matplotlib.text.Text at 0x11bf4b630>

In [104]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_C1_24")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_C1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[104]:
<matplotlib.text.Text at 0x11c19d160>

In [105]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_C1_24")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_C1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[105]:
<matplotlib.text.Text at 0x11c551d30>

In [ ]:


In [106]:
normb = merged[merged["protocol"] == 'normal_B_cell_D1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6c = pd.merge(out, methylation_differences, how='inner')
print(pairs6c.shape)
pairs6c = pairs6c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6c.methylation_difference # dependent variable to predict

X = pairs6c.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_D1_24', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_D1_24', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[106]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.012
Model: OLS Adj. R-squared: -0.006
Method: Least Squares F-statistic: 0.6705
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.571
Time: 17:37:01 Log-Likelihood: 401.10
No. Observations: 171 AIC: -794.2
Df Residuals: 167 BIC: -781.6
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 7.9337 10.359 0.766 0.445 -12.519 28.386
total_cpg_no_filter 1.452e-09 1.42e-08 0.102 0.919 -2.67e-08 2.96e-08
bsRate_mean -8.0530 10.819 -0.744 0.458 -29.413 13.307
avgReadCpG_mean -0.0287 0.034 -0.838 0.403 -0.096 0.039
Omnibus: 11.265 Durbin-Watson: 1.720
Prob(Omnibus): 0.004 Jarque-Bera (JB): 12.327
Skew: 0.649 Prob(JB): 0.00210
Kurtosis: 2.787 Cond. No. 5.21e+09

In [107]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24")


methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[107]:
<matplotlib.text.Text at 0x11c7a90f0>

In [108]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_D1_24")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_D1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[108]:
<matplotlib.text.Text at 0x11ca04710>

In [109]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_D1_24")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_D1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[109]:
<matplotlib.text.Text at 0x11cd0a9e8>

In [ ]:


In [110]:
normb = merged[merged["protocol"] == 'normal_B_cell_G1_22']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6d = pd.merge(out, methylation_differences, how='inner')
print(pairs6d.shape)
pairs6d = pairs6d.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6d.methylation_difference # dependent variable to predict

X = pairs6d.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_G1_22', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_G1_22', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[110]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.059
Model: OLS Adj. R-squared: 0.042
Method: Least Squares F-statistic: 3.514
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.0165
Time: 17:37:03 Log-Likelihood: 365.59
No. Observations: 171 AIC: -723.2
Df Residuals: 167 BIC: -710.6
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 33.2841 14.281 2.331 0.021 5.090 61.478
total_cpg_no_filter 4.045e-08 2.62e-08 1.546 0.124 -1.12e-08 9.21e-08
bsRate_mean -35.7711 15.084 -2.371 0.019 -65.552 -5.991
avgReadCpG_mean 0.2034 0.064 3.155 0.002 0.076 0.331
Omnibus: 11.170 Durbin-Watson: 2.004
Prob(Omnibus): 0.004 Jarque-Bera (JB): 12.206
Skew: 0.650 Prob(JB): 0.00224
Kurtosis: 2.843 Cond. No. 4.87e+09

In [111]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6d, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22")


methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[111]:
<matplotlib.text.Text at 0x11d047da0>

In [112]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6d, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_G1_22")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_G1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[112]:
<matplotlib.text.Text at 0x11d33b668>

In [113]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6d, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_G1_22")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_G1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[113]:
<matplotlib.text.Text at 0x11d57d9e8>

In [ ]:


In [ ]:


In [ ]:


In [114]:
normb = merged[merged["protocol"] == 'normal_B_cell_H1_22']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6e = pd.merge(out, methylation_differences, how='inner')
print(pairs6e.shape)
pairs6e = pairs6e.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})

y = pairs6e.methylation_difference # dependent variable to predict

X = pairs6e.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_H1_22', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


15
(105, 14)
(105,)
(105, 3)
Regression results for Normal 'normal_B_cell_H1_22', predict \delta methylation

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[114]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.114
Model: OLS Adj. R-squared: 0.088
Method: Least Squares F-statistic: 4.338
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.00642
Time: 17:37:06 Log-Likelihood: 219.02
No. Observations: 105 AIC: -430.0
Df Residuals: 101 BIC: -419.4
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 3.3101 26.278 0.126 0.900 -48.819 55.439
total_cpg_no_filter -1.492e-08 3.93e-08 -0.380 0.705 -9.28e-08 6.3e-08
bsRate_mean -1.0832 27.204 -0.040 0.968 -55.049 52.882
avgReadCpG_mean -0.4169 0.119 -3.496 0.001 -0.653 -0.180
Omnibus: 6.823 Durbin-Watson: 1.878
Prob(Omnibus): 0.033 Jarque-Bera (JB): 6.869
Skew: 0.626 Prob(JB): 0.0322
Kurtosis: 2.985 Cond. No. 5.84e+09

In [115]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6e, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22")


methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[115]:
<matplotlib.text.Text at 0x11d8bb780>

In [116]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6e, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_H1_22")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_H1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[116]:
<matplotlib.text.Text at 0x11da5a3c8>

In [117]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6e, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_H1_22")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_H1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[117]:
<matplotlib.text.Text at 0x11dca0470>

In [118]:
print(pairs1.shape)
print(pairs1a.shape)
print(pairs2.shape)
print(pairs2a.shape)
print(pairs2b.shape)
print(pairs3.shape)
print(pairs3a.shape)
print(pairs3b.shape)
print(pairs3c.shape)
print(pairs4.shape)
print(pairs4a.shape)
print(pairs4b.shape)
print(pairs4c.shape)
print(pairs5.shape)
print(pairs5a.shape)
print(pairs5b.shape)
print(pairs5c.shape)
print(pairs6.shape)
print(pairs6a.shape)
print(pairs6b.shape)
print(pairs6c.shape)
print(pairs6d.shape)
print(pairs6e.shape)


(210, 14)
(210, 14)
(190, 14)
(190, 14)
(190, 14)
(136, 14)
(231, 14)
(36, 14)
(190, 14)
(153, 14)
(153, 14)
(136, 14)
(210, 14)
(210, 14)
(190, 14)
(231, 14)
(190, 14)
(171, 14)
(190, 14)
(171, 14)
(171, 14)
(171, 14)
(105, 14)

In [119]:
pairs1['type'] = str('CLL')
pairs1a['type'] = str('CLL')
pairs2['type'] = str('CLL')
pairs2a['type'] = str('CLL')
pairs2b['type'] = str('CLL')
pairs3['type'] = str('normal')
pairs3a['type'] = str('normal')
pairs3b['type'] = str('normal')
pairs3c['type'] = str('normal')
pairs4['type'] = str('normal')
pairs4a['type'] = str('normal')
pairs4b['type'] = str('normal')
pairs4c['type'] = str('normal')
pairs5['type'] = str('normal')
pairs5a['type'] = str('normal')
pairs5b['type'] = str('normal')
pairs5c['type'] = str('normal')
pairs6['type'] = str('normal')
pairs6a['type'] = str('normal')
pairs6b['type'] = str('normal')
pairs6c['type'] = str('normal')
pairs6d['type'] = str('normal')
pairs6e['type'] = str('normal')


frames = [pairs1, pairs1a, pairs2, pairs2a, pairs2b, pairs3, pairs3a, pairs3b, pairs3c, 
          pairs4, pairs4a, pairs4b, pairs4c, pairs5, pairs5a, pairs5b, pairs5c, pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e]

In [120]:
total_pairs = pd.concat(frames)

In [121]:
total_pairs.shape


Out[121]:
(4035, 15)

In [122]:
y = total_pairs.methylation_difference # dependent variable
X = total_pairs.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)

In [123]:
categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)
    
X = X.drop(['type_normal'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation")
est.summary()


Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation
Out[123]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.088
Model: OLS Adj. R-squared: 0.087
Method: Least Squares F-statistic: 97.00
Date: Tue, 09 Aug 2016 Prob (F-statistic): 6.44e-79
Time: 17:37:09 Log-Likelihood: 9196.8
No. Observations: 4035 AIC: -1.838e+04
Df Residuals: 4030 BIC: -1.835e+04
Df Model: 4
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -0.1670 0.034 -4.902 0.000 -0.234 -0.100
total_cpg_no_filter -2.27e-08 2.15e-09 -10.561 0.000 -2.69e-08 -1.85e-08
bsRate_mean -0.1249 0.025 -4.917 0.000 -0.175 -0.075
avgReadCpG_mean 0.0624 0.004 14.144 0.000 0.054 0.071
type_CLL -0.0111 0.001 -10.501 0.000 -0.013 -0.009
Omnibus: 455.342 Durbin-Watson: 1.452
Prob(Omnibus): 0.000 Jarque-Bera (JB): 634.742
Skew: 0.883 Prob(JB): 1.47e-138
Kurtosis: 3.809 Cond. No. 4.97e+07

In [124]:
X.head()


Out[124]:
const total_cpg_no_filter bsRate_mean avgReadCpG_mean type_CLL
0 1 709541.5 0.980181 5.354731 1.0
1 1 786284.0 0.980293 5.367319 1.0
2 1 683623.5 0.980268 5.369633 1.0
3 1 853200.5 0.980262 5.367604 1.0
4 1 842686.5 0.980228 5.374553 1.0

In [125]:
print("methylation_difference vs. bsRate, jointplot, both CLL and normal ")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, both CLL and normal ")


methylation_difference vs. bsRate, jointplot, both CLL and normal 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[125]:
<matplotlib.text.Text at 0x11dff7b70>

In [126]:
print("methylation_difference vs. mean avgReadCpG per cell, jointplot, both CLL and normal ")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. avgReadCpG_mean, jointplot, both CLL and normal ")


methylation_difference vs. mean avgReadCpG per cell, jointplot, both CLL and normal 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[126]:
<matplotlib.text.Text at 0x11e3558d0>

In [127]:
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. total # Unique_CpGs per cell, jointplot, both CLL and normal ")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[127]:
<matplotlib.text.Text at 0x11e6585f8>

In [128]:
print("methylation_difference vs. bsRate, by type! ")
sns.lmplot(x="bsRate_mean", y="methylation_difference",  data=total_pairs, hue='type')
plt.title("methylation_difference vs. bsRate, by type, CLL vs normal")


methylation_difference vs. bsRate, by type! 
Out[128]:
<matplotlib.text.Text at 0x11ea20e80>

In [129]:
print("methylation_difference vs. avgReadCpG per cell, by type! ")
sns.lmplot(x="avgReadCpG_mean", y="methylation_difference",  data=total_pairs, hue='type')
plt.title("methylation_difference vs. avgReadCpG per cell, by type, CLL vs Normal ")


methylation_difference vs. avgReadCpG per cell, by type! 
Out[129]:
<matplotlib.text.Text at 0x11eb58ba8>

In [130]:
print("methylation_difference vs. total unique CpG per cel, by type!")
sns.lmplot(x="total_cpg_no_filter", y="methylation_difference",  data=total_pairs, hue='type')
plt.title("methylation_difference vs. total unique CpG per cell, by type, CLL vs Normal ")


methylation_difference vs. total unique CpG per cel, by type!
Out[130]:
<matplotlib.text.Text at 0x11ed27a20>

In [131]:
#
# Let's see feature ranking
#
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting methylation_difference: cell type, avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:11: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
Random Forest model score is 0.896636567631

In [132]:
#
# Now do analysis by cell type, CLL vs normal, entire
#
cll_frames = [pairs1, pairs1a, pairs2, pairs2a, pairs2b]

normal_frames = [pairs3, pairs3a, pairs3b, pairs3c, pairs4, pairs4a, pairs4b, pairs4c, 
                 pairs5, pairs5a, pairs5b, pairs5c, pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e]

cll_pairs = pd.concat(cll_frames)
print(cll_pairs.shape)

normal_pairs = pd.concat(normal_frames)
print(normal_pairs.shape)


(990, 15)
(3045, 15)

In [133]:
#
# CLL first
#
y = cll_pairs.methylation_difference # dependent variable
X = cll_pairs.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG", "type"], axis=1)

X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation")
print(X.shape)
est.summary()


Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation
(990, 4)
Out[133]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.222
Model: OLS Adj. R-squared: 0.219
Method: Least Squares F-statistic: 93.70
Date: Tue, 09 Aug 2016 Prob (F-statistic): 2.35e-53
Time: 17:39:32 Log-Likelihood: 2553.7
No. Observations: 990 AIC: -5099.
Df Residuals: 986 BIC: -5080.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -0.1367 0.117 -1.164 0.245 -0.367 0.094
total_cpg_no_filter -4.576e-08 5e-09 -9.152 0.000 -5.56e-08 -3.59e-08
bsRate_mean -0.1544 0.112 -1.376 0.169 -0.375 0.066
avgReadCpG_mean 0.0624 0.008 7.653 0.000 0.046 0.078
Omnibus: 61.542 Durbin-Watson: 1.710
Prob(Omnibus): 0.000 Jarque-Bera (JB): 77.273
Skew: 0.570 Prob(JB): 1.66e-17
Kurtosis: 3.757 Cond. No. 1.63e+08

In [134]:
print("methylation_difference vs. bsRate, jointplot, CLL pairs ")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  CLL pairs ")


methylation_difference vs. bsRate, jointplot, CLL pairs 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[134]:
<matplotlib.text.Text at 0x193981630>

In [135]:
print("methylation_difference vs. mean avgReadCpG per cell, jointplot,  CLL pairs ")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. avgReadCpG_mean, jointplot, CLL pairs ")


methylation_difference vs. mean avgReadCpG per cell, jointplot,  CLL pairs 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[135]:
<matplotlib.text.Text at 0x1ed6172e8>

In [136]:
print("methylation diference vs. total # unique CpGs per cell, CLL pairs")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. total # Unique_CpGs per cell,  CLL pairs")


methylation diference vs. total # unique CpGs per cell, CLL pairs
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[136]:
<matplotlib.text.Text at 0x231506668>

In [137]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting methylation_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
Random Forest model score is 0.921781235455

In [ ]:


In [138]:
#
# normal
#
y = normal_pairs.methylation_difference # dependent variable
X = normal_pairs.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG", "type"], axis=1)

X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation")
print(X.shape)
est.summary()


Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation
(3045, 4)
Out[138]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.040
Model: OLS Adj. R-squared: 0.039
Method: Least Squares F-statistic: 42.30
Date: Tue, 09 Aug 2016 Prob (F-statistic): 8.94e-27
Time: 17:40:17 Log-Likelihood: 6751.1
No. Observations: 3045 AIC: -1.349e+04
Df Residuals: 3041 BIC: -1.347e+04
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -0.1901 0.038 -4.979 0.000 -0.265 -0.115
total_cpg_no_filter -1.431e-08 2.68e-09 -5.341 0.000 -1.96e-08 -9.06e-09
bsRate_mean -0.0775 0.028 -2.725 0.006 -0.133 -0.022
avgReadCpG_mean 0.0573 0.005 11.053 0.000 0.047 0.067
Omnibus: 328.375 Durbin-Watson: 1.437
Prob(Omnibus): 0.000 Jarque-Bera (JB): 441.982
Skew: 0.883 Prob(JB): 1.06e-96
Kurtosis: 3.601 Cond. No. 4.12e+07

In [139]:
print("methylation_difference vs. bsRate, jointplot, Normal pairs ")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal pairs ")


methylation_difference vs. bsRate, jointplot, Normal pairs 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[139]:
<matplotlib.text.Text at 0x11f42f7b8>

In [140]:
print("methylation_difference vs. mean avgReadCpG per cell, jointplot, Normal pairs ")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. avgReadCpG_mean, jointplot, Normal pairs ")


methylation_difference vs. mean avgReadCpG per cell, jointplot, Normal pairs 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[140]:
<matplotlib.text.Text at 0x11f864e48>

In [141]:
print("methylation diference vs. total # unique CpGs per cell, Normal pairs")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. total # Unique_CpGs per cell, Normal pairs")


methylation diference vs. total # unique CpGs per cell, Normal pairs
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[141]:
<matplotlib.text.Text at 0x11fdca0b8>

In [142]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting methylation_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
Random Forest model score is 0.888251739861

In [ ]:


In [ ]:


In [ ]: