In [1]:
%matplotlib inline

In [2]:
#
# Covariates are: 
#   - Number of unique CpGs per cell
#   - Median Average Read CpG per cell (or mean if normally distributed)
#   - BS rate per cell
#   - CLL or Normal status per cell
#
# For distances (i.e. PDR difference between pairs, methylation difference between pairs), 
# the covariates are the mean between the two pairs. 
#

In [3]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # print all rows


import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')

import statsmodels.api as sm

In [4]:
stats = pd.read_csv("all_RRBS_statistics_final.csv")

In [5]:
stats.shape


Out[5]:
(438, 15)

In [6]:
normal = stats[stats["type"]=="normal"]
CLL = stats[stats["type"]=="CLL"]

In [7]:
len(normal)


Out[7]:
336

In [8]:
len(CLL)


Out[8]:
102

In [9]:
mcell_cpg = pd.read_csv("mcell_avgCpGs.csv")
pcell_cpg = pd.read_csv("pcell_avgCpGs.csv")
CD19cell_cpg = pd.read_csv("CD19_avgCpGs.csv")
normalB_cell_cpg = pd.read_csv("normalB_avgCpGs.csv")
trito_cell_cpg = pd.read_csv("trito_avgCpGs.csv")
cw154_cell_cpg = pd.read_csv("cw154_cpgs.csv")

In [10]:
mcell_cpg = mcell_cpg.drop(["Unnamed: 0"], axis=1)
pcell_cpg = pcell_cpg.drop(["Unnamed: 0"], axis=1)
CD19cell_cpg = CD19cell_cpg.drop(["Unnamed: 0"], axis=1)
normalB_cell_cpg = normalB_cell_cpg.drop(["Unnamed: 0"], axis=1)
trito_cell_cpg = trito_cell_cpg.drop(["Unnamed: 0"], axis=1)
cw154_cell_cpg = cw154_cell_cpg.drop(["Unnamed: 0"], axis=1)

In [11]:
cpg_total = pd.concat([mcell_cpg, pcell_cpg, CD19cell_cpg, normalB_cell_cpg, trito_cell_cpg, cw154_cell_cpg])

In [12]:
cpg_total.shape


Out[12]:
(513, 4)

In [13]:
merged = stats.merge(cpg_total, on="filename")

In [14]:
merged.shape


Out[14]:
(438, 18)

In [15]:
merged.head()


Out[15]:
filename methylation PDR_total methylation_unweighted PDR_unweighted thisMeth mixedReadCount total_reads type bio protocol total_cpg_no_filter total_cpg_gtrthan1 total_cpg_gtrthan38 bsRate avgReadCpgs_nofilter avgReadCpgs_lessthan1CpG avgReadCpgs_gtreql3.8CpG
0 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CATGAC 0.529505 0.235795 0.632802 0.231878 2208325.0 983394.0 4170549.0 normal CD19CD27m NormalBCD19pCD27mcell67_88 525282.0 525251.0 435636.0 0.9975 5.354284 5.355660 7.019255
1 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CCTTCG 0.455550 0.177631 0.583859 0.175371 733064.0 285841.0 1609185.0 normal CD19CD27m NormalBCD19pCD27mcell67_88 221972.0 221962.0 186757.0 0.9975 5.587294 5.588449 7.302612
2 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CGGTAG 0.515269 0.177645 0.618578 0.174221 1452802.0 500870.0 2819500.0 normal CD19CD27m NormalBCD19pCD27mcell67_88 355730.0 355713.0 295624.0 0.9975 5.393199 5.394331 7.079288
3 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTATTG 0.556175 0.176367 0.652727 0.172273 2279354.0 722800.0 4098270.0 normal CD19CD27m NormalBCD19pCD27mcell67_88 483179.0 483150.0 397812.0 0.9975 5.287116 5.288477 6.979525
4 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.CTCAGC 0.528642 0.181331 0.640401 0.172287 1394208.0 478231.0 2637340.0 normal CD19CD27m NormalBCD19pCD27mcell67_88 356122.0 356100.0 294065.0 0.9975 5.314302 5.315719 6.995052

In [16]:
sns.lmplot(x="bsRate", y="methylation",  data=merged)
plt.title("methylation (weighted) vs bisulfite conversion rate")


Out[16]:
<matplotlib.text.Text at 0x109515160>

In [17]:
sns.lmplot(x="bsRate", y="methylation_unweighted",  data=merged)
plt.title("methylation (unweighted) vs bisulfite conversion rate")


Out[17]:
<matplotlib.text.Text at 0x1095de4a8>

In [18]:
sns.lmplot(x="bsRate", y="PDR_total",  data=merged)
plt.title("PDR (weighted) vs bisulfite conversion rate")


Out[18]:
<matplotlib.text.Text at 0x10a00c048>

In [19]:
sns.lmplot(x="bsRate", y="PDR_unweighted",  data=merged)
plt.title("PDR (unweighted) vs bisulfite conversion rate")


Out[19]:
<matplotlib.text.Text at 0x10a110b00>

In [20]:
sns.lmplot(x="bsRate", y="total_cpg_no_filter",  data=merged)
plt.title("Total # unique CpGs per cell (no filter) vs bisulfite conversion rate")


Out[20]:
<matplotlib.text.Text at 0x10a219908>

In [21]:
sns.lmplot(x="bsRate", y="total_cpg_gtrthan1",  data=merged)
plt.title("Total # unique CpGs per cell (filter: > 1) vs bisulfite conversion rate")


Out[21]:
<matplotlib.text.Text at 0x10a31fcf8>

In [22]:
sns.lmplot(x="bsRate", y="total_cpg_gtrthan38",  data=merged)
plt.title("Total # unique CpGs per cell (filter: >= 3.8) vs bisulfite conversion rate")


Out[22]:
<matplotlib.text.Text at 0x10a4256d8>

In [23]:
sns.lmplot(x="bsRate", y="avgReadCpgs_nofilter",  data=merged)
plt.title("Mean avgCpG read per cell (no filter) vs bisulfite conversion rate")


Out[23]:
<matplotlib.text.Text at 0x10a5b4a20>

In [24]:
sns.lmplot(x="bsRate", y="avgReadCpgs_lessthan1CpG",  data=merged)
plt.title("Mean avgCpG read per cell (filter: > 1) vs bisulfite conversion rate")


Out[24]:
<matplotlib.text.Text at 0x10a6b7a20>

In [25]:
sns.lmplot(x="bsRate", y="avgReadCpgs_gtreql3.8CpG",  data=merged)
plt.title("Mean avgCpG read per cell (filter: >= 3.8) vs bisulfite conversion rate")


Out[25]:
<matplotlib.text.Text at 0x10a842358>

In [ ]:


In [ ]:


In [26]:
tritopool = merged[merged["protocol"] == 'trito_pool_1']       # select only "trito_pool_1" files
print(len(tritopool))
tritopool = tritopool.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
tritopoolA = tritopool.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool.filename,2)) # combines into all pairs
out = pd.DataFrame([tritopoolA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.methylation_unweighted, tritopool.methylation_unweighted)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1 = pd.merge(out, methylation_differences, how='inner')
print(pairs1.shape)
pairs1 = pd.merge(out, methylation_differences, how='inner')
print(pairs1.shape)
pairs1 = pairs1.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs1.methylation_difference # dependent variable to predict

X = pairs1.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


21
(210, 14)
(210, 14)
(210,)
(210, 3)
Regression results for CLL 'RRBS_trito_pool1', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[26]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.021
Model: OLS Adj. R-squared: 0.006
Method: Least Squares F-statistic: 1.438
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.233
Time: 18:36:09 Log-Likelihood: 802.67
No. Observations: 210 AIC: -1597.
Df Residuals: 206 BIC: -1584.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -2.0577 5.118 -0.402 0.688 -12.148 8.032
total_cpg_no_filter -4.351e-09 5.06e-09 -0.861 0.390 -1.43e-08 5.62e-09
bsRate_mean 1.9206 5.218 0.368 0.713 -8.368 12.209
avgReadCpG_mean 0.0265 0.013 2.045 0.042 0.001 0.052
Omnibus: 13.602 Durbin-Watson: 2.350
Prob(Omnibus): 0.001 Jarque-Bera (JB): 11.078
Skew: 0.471 Prob(JB): 0.00393
Kurtosis: 2.384 Cond. No. 1.46e+10

In [27]:
print("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs1, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")


methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[27]:
<matplotlib.text.Text at 0x10aa3d940>

In [28]:
print("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs1, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")


methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[28]:
<matplotlib.text.Text at 0x10ac99470>

In [29]:
print("methylation_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs1, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")


methylation_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_1
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[29]:
<matplotlib.text.Text at 0x10b065860>

In [30]:
tritopool = merged[merged["protocol"] == 'trito_pool_2']       # select only "trito_pool_1" files
print(len(tritopool))
tritopool = tritopool.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
tritopoolA = tritopool.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool.filename,2)) # combines into all pairs
out = pd.DataFrame([tritopoolA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.methylation_unweighted, tritopool.methylation_unweighted)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1a = pd.merge(out, methylation_differences, how='inner')
print(pairs1a.shape)
pairs1a = pairs1a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs1a.methylation_difference # dependent variable to predict

X = pairs1a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


21
(210, 14)
(210,)
(210, 3)
Regression results for CLL 'RRBS_trito_pool1', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[30]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.221
Model: OLS Adj. R-squared: 0.210
Method: Least Squares F-statistic: 19.50
Date: Tue, 09 Aug 2016 Prob (F-statistic): 3.60e-11
Time: 18:36:13 Log-Likelihood: 829.60
No. Observations: 210 AIC: -1651.
Df Residuals: 206 BIC: -1638.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -35.0826 5.224 -6.716 0.000 -45.381 -24.784
total_cpg_no_filter -2.213e-08 4.16e-09 -5.322 0.000 -3.03e-08 -1.39e-08
bsRate_mean 36.1434 5.417 6.672 0.000 25.463 46.824
avgReadCpG_mean -0.0227 0.013 -1.735 0.084 -0.049 0.003
Omnibus: 7.518 Durbin-Watson: 1.911
Prob(Omnibus): 0.023 Jarque-Bera (JB): 6.368
Skew: 0.342 Prob(JB): 0.0414
Kurtosis: 2.491 Cond. No. 1.68e+10

In [31]:
print("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs1a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")


methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[31]:
<matplotlib.text.Text at 0x10b2c3f28>

In [32]:
print("methylation_difference vs. mean avgReadCpGs, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs1a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")


methylation_difference vs. mean avgReadCpGs, jointplot, RRBS_trito_pool CLL trito_2
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[32]:
<matplotlib.text.Text at 0x10b5de6d8>

In [33]:
print("methylation_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs1a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")


methylation_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_2
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[33]:
<matplotlib.text.Text at 0x10b8ff908>

In [ ]:


In [34]:
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.methylation_unweighted, cw154.methylation_unweighted)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2 = pd.merge(out, methylation_differences, how='inner')
print(pairs2.shape)
pairs2 = pairs2.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs2.methylation_difference # dependent variable to predict

X = pairs2.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_Tris_protease', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_Tris_protease', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[34]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.360
Model: OLS Adj. R-squared: 0.350
Method: Least Squares F-statistic: 34.89
Date: Tue, 09 Aug 2016 Prob (F-statistic): 6.18e-18
Time: 18:36:17 Log-Likelihood: 500.87
No. Observations: 190 AIC: -993.7
Df Residuals: 186 BIC: -980.7
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 19.0220 7.679 2.477 0.014 3.872 34.172
total_cpg_no_filter -8.486e-08 1.3e-08 -6.521 0.000 -1.11e-07 -5.92e-08
bsRate_mean -21.0470 8.039 -2.618 0.010 -36.907 -5.187
avgReadCpG_mean 0.1779 0.024 7.285 0.000 0.130 0.226
Omnibus: 6.478 Durbin-Watson: 1.759
Prob(Omnibus): 0.039 Jarque-Bera (JB): 8.887
Skew: 0.186 Prob(JB): 0.0118
Kurtosis: 3.992 Cond. No. 3.27e+09

In [35]:
print("methylation_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs2, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, cw154  CLL cw154_Tris_protease")


methylation_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[35]:
<matplotlib.text.Text at 0x10bba5048>

In [36]:
print("methylation_difference vs. avgReadCpGs, jointplot, CLL cw154_Tris_protease")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs2, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_Tris_protease")


methylation_difference vs. avgReadCpGs, jointplot, CLL cw154_Tris_protease
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[36]:
<matplotlib.text.Text at 0x10bd8ecf8>

In [37]:
print("methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs2, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_Tris_protease")


methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[37]:
<matplotlib.text.Text at 0x10c0b6668>

In [38]:
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease_GR']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.methylation_unweighted, cw154.methylation_unweighted)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2a = pd.merge(out, methylation_differences, how='inner')
print(pairs2a.shape)
pairs2a = pairs2a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs2a.methylation_difference # dependent variable to predict

X = pairs2a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_Tris_protease_GR', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_Tris_protease_GR', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[38]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.189
Model: OLS Adj. R-squared: 0.176
Method: Least Squares F-statistic: 14.44
Date: Tue, 09 Aug 2016 Prob (F-statistic): 1.70e-08
Time: 18:36:21 Log-Likelihood: 511.23
No. Observations: 190 AIC: -1014.
Df Residuals: 186 BIC: -1001.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -22.7365 8.065 -2.819 0.005 -38.648 -6.825
total_cpg_no_filter -4.959e-08 9.97e-09 -4.976 0.000 -6.92e-08 -2.99e-08
bsRate_mean 23.4179 8.463 2.767 0.006 6.723 40.113
avgReadCpG_mean 0.0395 0.016 2.392 0.018 0.007 0.072
Omnibus: 2.971 Durbin-Watson: 2.181
Prob(Omnibus): 0.226 Jarque-Bera (JB): 2.767
Skew: 0.223 Prob(JB): 0.251
Kurtosis: 2.612 Cond. No. 4.47e+09

In [39]:
print("methylation_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease_GR")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs2a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, cw154  CLL cw154_Tris_protease_GR")


methylation_difference vs. bsRate, jointplot,  CLL cw154_Tris_protease_GR
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[39]:
<matplotlib.text.Text at 0x10c367240>

In [40]:
print("methylation_difference vs. avgReadCpgs, jointplot, CLL cw154_Tris_protease_GR")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs2a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_Tris_protease_GR")


methylation_difference vs. avgReadCpgs, jointplot, CLL cw154_Tris_protease_GR
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[40]:
<matplotlib.text.Text at 0x10c57a780>

In [41]:
print("methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease_GR")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs2a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_Tris_protease_GR")


methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease_GR
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[41]:
<matplotlib.text.Text at 0x10c8a5d68>

In [ ]:


In [ ]:


In [42]:
cw154 = merged[merged["protocol"] == 'cw154_CutSmart_proteinase_K']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.methylation_unweighted, cw154.methylation_unweighted)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2b = pd.merge(out, methylation_differences, how='inner')
print(pairs2b.shape)
pairs2b = pairs2b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs2b.methylation_difference # dependent variable to predict

X = pairs2b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[42]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.123
Model: OLS Adj. R-squared: 0.109
Method: Least Squares F-statistic: 8.711
Date: Tue, 09 Aug 2016 Prob (F-statistic): 1.95e-05
Time: 18:36:24 Log-Likelihood: 601.57
No. Observations: 190 AIC: -1195.
Df Residuals: 186 BIC: -1182.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -21.9000 6.329 -3.460 0.001 -34.385 -9.415
total_cpg_no_filter -3.642e-08 7.86e-09 -4.631 0.000 -5.19e-08 -2.09e-08
bsRate_mean 22.5194 6.561 3.432 0.001 9.575 35.463
avgReadCpG_mean 0.0319 0.021 1.491 0.138 -0.010 0.074
Omnibus: 7.973 Durbin-Watson: 1.469
Prob(Omnibus): 0.019 Jarque-Bera (JB): 8.280
Skew: 0.490 Prob(JB): 0.0159
Kurtosis: 2.708 Cond. No. 6.96e+09

In [43]:
print("methylation_difference vs. bsRate, jointplot,  CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs2b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")


methylation_difference vs. bsRate, jointplot,  CLL cw154_CutSmart_proteinase_K
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[43]:
<matplotlib.text.Text at 0x10cb3fb38>

In [44]:
print("methylation_difference vs. avgReadCpG_mean, jointplot, CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs2b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154  CLL cw154_CutSmart_proteinase_K")


methylation_difference vs. avgReadCpG_mean, jointplot, CLL cw154_CutSmart_proteinase_K
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[44]:
<matplotlib.text.Text at 0x10cd9da90>

In [45]:
print("methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs2b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")


methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_CutSmart_proteinase_K
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[45]:
<matplotlib.text.Text at 0x10cfb8160>

In [ ]:


In [46]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell1_22_']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation_unweighted, pcell.methylation_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3 = pd.merge(out, methylation_differences, how='inner')
print(pairs3.shape)
pairs3 = pairs3.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs3.methylation_difference # dependent variable to predict

X = pairs3.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


17
(136, 14)
(136,)
(136, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[46]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.113
Model: OLS Adj. R-squared: 0.093
Method: Least Squares F-statistic: 5.591
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.00122
Time: 18:36:28 Log-Likelihood: 311.36
No. Observations: 136 AIC: -614.7
Df Residuals: 132 BIC: -603.1
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -138.4777 36.215 -3.824 0.000 -210.115 -66.840
total_cpg_no_filter 3.22e-08 4.53e-08 0.710 0.479 -5.75e-08 1.22e-07
bsRate_mean 138.3859 36.307 3.812 0.000 66.567 210.204
avgReadCpG_mean 0.0696 0.047 1.478 0.142 -0.024 0.163
Omnibus: 10.220 Durbin-Watson: 1.370
Prob(Omnibus): 0.006 Jarque-Bera (JB): 10.610
Skew: 0.679 Prob(JB): 0.00497
Kurtosis: 3.169 Cond. No. 7.56e+09

In [47]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs3, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[47]:
<matplotlib.text.Text at 0x10d30ac18>

In [48]:
print("methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs3, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell1_22_")


methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[48]:
<matplotlib.text.Text at 0x10d47f1d0>

In [49]:
print("methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs3, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_")


methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[49]:
<matplotlib.text.Text at 0x10d860fd0>

In [ ]:


In [50]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell23_44']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation_unweighted, pcell.methylation_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3a = pd.merge(out, methylation_differences, how='inner')
print(pairs3a.shape)
pairs3a = pairs3a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs3a.methylation_difference # dependent variable to predict

X = pairs3a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


22
(231, 14)
(231,)
(231, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[50]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.209
Model: OLS Adj. R-squared: 0.198
Method: Least Squares F-statistic: 19.94
Date: Tue, 09 Aug 2016 Prob (F-statistic): 1.65e-11
Time: 18:36:31 Log-Likelihood: 564.16
No. Observations: 231 AIC: -1120.
Df Residuals: 227 BIC: -1107.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -74.2043 35.414 -2.095 0.037 -143.986 -4.423
total_cpg_no_filter 2.967e-08 2.72e-08 1.092 0.276 -2.39e-08 8.32e-08
bsRate_mean 73.6724 35.475 2.077 0.039 3.771 143.574
avgReadCpG_mean 0.1049 0.016 6.373 0.000 0.072 0.137
Omnibus: 3.491 Durbin-Watson: 1.732
Prob(Omnibus): 0.175 Jarque-Bera (JB): 3.425
Skew: 0.252 Prob(JB): 0.180
Kurtosis: 2.681 Cond. No. 1.03e+10

In [51]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs3a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[51]:
<matplotlib.text.Text at 0x10dae2668>

In [52]:
print("methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs3a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell23_44")


methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[52]:
<matplotlib.text.Text at 0x10daa49e8>

In [53]:
print("methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs3a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell23_44")


methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[53]:
<matplotlib.text.Text at 0x10e02e3c8>

In [ ]:


In [54]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell45_66']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation_unweighted, pcell.methylation_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3b = pd.merge(out, methylation_differences, how='inner')
print(pairs3b.shape)
pairs3b = pairs3b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs3b.methylation_difference # dependent variable to predict

X = pairs3b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


9
(36, 14)
(36,)
(36, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[54]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.400
Model: OLS Adj. R-squared: 0.344
Method: Least Squares F-statistic: 7.110
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.000860
Time: 18:36:35 Log-Likelihood: 97.213
No. Observations: 36 AIC: -186.4
Df Residuals: 32 BIC: -180.1
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 223.0500 141.234 1.579 0.124 -64.635 510.735
total_cpg_no_filter 9.207e-09 2.01e-08 0.457 0.651 -3.18e-08 5.02e-08
bsRate_mean -223.9858 141.323 -1.585 0.123 -511.852 63.880
avgReadCpG_mean 0.0578 0.045 1.277 0.211 -0.034 0.150
Omnibus: 1.776 Durbin-Watson: 2.473
Prob(Omnibus): 0.412 Jarque-Bera (JB): 1.652
Skew: -0.481 Prob(JB): 0.438
Kurtosis: 2.581 Cond. No. 3.29e+10

In [55]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs3b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[55]:
<matplotlib.text.Text at 0x10e29b630>

In [56]:
print("methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs3b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell45_66")


methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[56]:
<matplotlib.text.Text at 0x10e4bfe10>

In [57]:
print("methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs3b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell45_66")


methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[57]:
<matplotlib.text.Text at 0x10e7df8d0>

In [ ]:


In [ ]:


In [58]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell67_88']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation_unweighted, pcell.methylation_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3c = pd.merge(out, methylation_differences, how='inner')
print(pairs3c.shape)
pairs3c = pairs3c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs3c.methylation_difference # dependent variable to predict

X = pairs3c.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[58]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.078
Model: OLS Adj. R-squared: 0.063
Method: Least Squares F-statistic: 5.265
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.00165
Time: 18:36:38 Log-Likelihood: 524.46
No. Observations: 190 AIC: -1041.
Df Residuals: 186 BIC: -1028.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 63.6950 23.866 2.669 0.008 16.612 110.778
total_cpg_no_filter 6.236e-08 1.96e-08 3.177 0.002 2.36e-08 1.01e-07
bsRate_mean -63.8511 23.860 -2.676 0.008 -110.923 -16.780
avgReadCpG_mean -0.0030 0.026 -0.119 0.905 -0.054 0.047
Omnibus: 23.349 Durbin-Watson: 0.996
Prob(Omnibus): 0.000 Jarque-Bera (JB): 28.548
Skew: 0.943 Prob(JB): 6.32e-07
Kurtosis: 3.218 Cond. No. 8.48e+09

In [59]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs3c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27pcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[59]:
<matplotlib.text.Text at 0x10ea6f278>

In [60]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs3c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell67_88")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27pcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[60]:
<matplotlib.text.Text at 0x10ed740b8>

In [61]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs3c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27pcell67_88")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27pcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[61]:
<matplotlib.text.Text at 0x10ef9da58>

In [ ]:


In [ ]:


In [62]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell1_22_']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation_unweighted, mcell.methylation_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4 = pd.merge(out, methylation_differences, how='inner')
print(pairs4.shape)
pairs4 = pairs4.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs4.methylation_difference # dependent variable to predict

X = pairs4.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell1_22_', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


18
(153, 14)
(153,)
(153, 3)
Regression results for Normal 'NormalBCD19pCD27mcell1_22_', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[62]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.522
Model: OLS Adj. R-squared: 0.513
Method: Least Squares F-statistic: 54.33
Date: Tue, 09 Aug 2016 Prob (F-statistic): 8.74e-24
Time: 18:36:42 Log-Likelihood: 394.90
No. Observations: 153 AIC: -781.8
Df Residuals: 149 BIC: -769.7
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -7.1901 49.960 -0.144 0.886 -105.912 91.532
total_cpg_no_filter 1.496e-07 2.75e-08 5.436 0.000 9.52e-08 2.04e-07
bsRate_mean 5.1867 50.207 0.103 0.918 -94.024 104.397
avgReadCpG_mean 0.2839 0.029 9.804 0.000 0.227 0.341
Omnibus: 11.256 Durbin-Watson: 1.378
Prob(Omnibus): 0.004 Jarque-Bera (JB): 4.637
Skew: 0.120 Prob(JB): 0.0984
Kurtosis: 2.181 Cond. No. 1.52e+10

In [63]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs4, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[63]:
<matplotlib.text.Text at 0x10f1fc588>

In [64]:
print("methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs4, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell1_22_")


methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[64]:
<matplotlib.text.Text at 0x10f46d2e8>

In [65]:
print("methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs4, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_")


methylation_difference vs. total unique CpGs, jointplot,  Normal 'NormalBCD19pCD27mcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[65]:
<matplotlib.text.Text at 0x10f782cf8>

In [ ]:


In [66]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell23_44']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation_unweighted, mcell.methylation_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4a = pd.merge(out, methylation_differences, how='inner')
print(pairs4a.shape)
pairs4a = pairs4a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs4a.methylation_difference # dependent variable to predict

X = pairs4a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell23_44', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


18
(153, 14)
(153,)
(153, 3)
Regression results for Normal 'NormalBCD19pCD27mcell23_44', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[66]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.126
Model: OLS Adj. R-squared: 0.108
Method: Least Squares F-statistic: 7.145
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.000162
Time: 18:36:46 Log-Likelihood: 462.75
No. Observations: 153 AIC: -917.5
Df Residuals: 149 BIC: -905.4
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -77.3071 36.153 -2.138 0.034 -148.747 -5.867
total_cpg_no_filter 9.438e-08 2.42e-08 3.900 0.000 4.66e-08 1.42e-07
bsRate_mean 77.1685 36.254 2.129 0.035 5.530 148.807
avgReadCpG_mean 0.0444 0.025 1.808 0.073 -0.004 0.093
Omnibus: 5.115 Durbin-Watson: 1.824
Prob(Omnibus): 0.077 Jarque-Bera (JB): 3.143
Skew: 0.152 Prob(JB): 0.208
Kurtosis: 2.367 Cond. No. 1.55e+10

In [67]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs4a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[67]:
<matplotlib.text.Text at 0x10faad550>

In [68]:
print("methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs4a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell23_44")


methylation_difference vs. avgReadCpG_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[68]:
<matplotlib.text.Text at 0x10fc3aba8>

In [69]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs4a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell23_44")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[69]:
<matplotlib.text.Text at 0x10ff73ac8>

In [ ]:


In [70]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell45_66']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation_unweighted, mcell.methylation_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4b = pd.merge(out, methylation_differences, how='inner')
print(pairs4b.shape)
pairs4b = pairs4b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs4b.methylation_difference # dependent variable to predict

X = pairs4b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell45_66', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


17
(136, 14)
(136,)
(136, 3)
Regression results for Normal 'NormalBCD19pCD27mcell45_66', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[70]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.338
Model: OLS Adj. R-squared: 0.323
Method: Least Squares F-statistic: 22.51
Date: Tue, 09 Aug 2016 Prob (F-statistic): 7.84e-12
Time: 18:36:50 Log-Likelihood: 352.25
No. Observations: 136 AIC: -696.5
Df Residuals: 132 BIC: -684.8
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -15.1604 48.621 -0.312 0.756 -111.338 81.018
total_cpg_no_filter 2.116e-07 4.21e-08 5.030 0.000 1.28e-07 2.95e-07
bsRate_mean 13.4284 48.697 0.276 0.783 -82.899 109.756
avgReadCpG_mean 0.2490 0.032 7.896 0.000 0.187 0.311
Omnibus: 7.225 Durbin-Watson: 1.193
Prob(Omnibus): 0.027 Jarque-Bera (JB): 7.374
Skew: 0.570 Prob(JB): 0.0250
Kurtosis: 2.970 Cond. No. 1.28e+10

In [71]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs4b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[71]:
<matplotlib.text.Text at 0x110180748>

In [72]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs4b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell45_66")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[72]:
<matplotlib.text.Text at 0x1104b8e10>

In [73]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs4b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell45_66")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[73]:
<matplotlib.text.Text at 0x110808048>

In [ ]:


In [74]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell67_88']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation_unweighted, mcell.methylation_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4c = pd.merge(out, methylation_differences, how='inner')
print(pairs4c.shape)
pairs4c = pairs4c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs4c.methylation_difference # dependent variable to predict

X = pairs4c.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell67_88', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


21
(210, 14)
(210,)
(210, 3)
Regression results for Normal 'NormalBCD19pCD27mcell67_88', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[74]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.211
Model: OLS Adj. R-squared: 0.199
Method: Least Squares F-statistic: 18.34
Date: Tue, 09 Aug 2016 Prob (F-statistic): 1.38e-10
Time: 18:36:53 Log-Likelihood: 526.72
No. Observations: 210 AIC: -1045.
Df Residuals: 206 BIC: -1032.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -191.6946 87.932 -2.180 0.030 -365.057 -18.332
total_cpg_no_filter 1.184e-07 3.26e-08 3.636 0.000 5.42e-08 1.83e-07
bsRate_mean 190.9061 88.069 2.168 0.031 17.274 364.539
avgReadCpG_mean 0.1767 0.026 6.728 0.000 0.125 0.228
Omnibus: 3.789 Durbin-Watson: 1.935
Prob(Omnibus): 0.150 Jarque-Bera (JB): 3.842
Skew: 0.323 Prob(JB): 0.146
Kurtosis: 2.852 Cond. No. 3.13e+10

In [75]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs4c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")


methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[75]:
<matplotlib.text.Text at 0x110b76e80>

In [76]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs4c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[76]:
<matplotlib.text.Text at 0x110d6b4e0>

In [77]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs4c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[77]:
<matplotlib.text.Text at 0x110fc5198>

In [ ]:


In [78]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell1_22_']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.methylation_unweighted, CD19cell.methylation_unweighted)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5 = pd.merge(out, methylation_differences, how='inner')
print(pairs5.shape)
pairs5 = pairs5.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs5.methylation_difference # dependent variable to predict

X = pairs5.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell1_22_', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


21
(210, 14)
(210,)
(210, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell1_22_', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[78]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.085
Model: OLS Adj. R-squared: 0.071
Method: Least Squares F-statistic: 6.342
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.000392
Time: 18:36:57 Log-Likelihood: 455.25
No. Observations: 210 AIC: -902.5
Df Residuals: 206 BIC: -889.1
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -133.5188 45.938 -2.906 0.004 -224.088 -42.950
total_cpg_no_filter -3.3e-08 1.15e-08 -2.877 0.004 -5.56e-08 -1.04e-08
bsRate_mean 133.6853 46.060 2.902 0.004 42.875 224.495
avgReadCpG_mean 0.0502 0.031 1.612 0.108 -0.011 0.112
Omnibus: 15.389 Durbin-Watson: 2.014
Prob(Omnibus): 0.000 Jarque-Bera (JB): 11.234
Skew: 0.453 Prob(JB): 0.00363
Kurtosis: 2.319 Cond. No. 1.48e+10

In [79]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs5, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")


methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[79]:
<matplotlib.text.Text at 0x111353320>

In [80]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs5, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell1_22_")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[80]:
<matplotlib.text.Text at 0x111569cf8>

In [81]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs5, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell1_22_
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[81]:
<matplotlib.text.Text at 0x1112f3710>

In [ ]:


In [82]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell23_44']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.methylation_unweighted, CD19cell.methylation_unweighted)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5a = pd.merge(out, methylation_differences, how='inner')
print(pairs5a.shape)
pairs5a = pairs5a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs5a.methylation_difference # dependent variable to predict

X = pairs5a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell23_44', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell23_44', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[82]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.088
Model: OLS Adj. R-squared: 0.074
Method: Least Squares F-statistic: 6.000
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.000634
Time: 18:37:01 Log-Likelihood: 456.82
No. Observations: 190 AIC: -905.6
Df Residuals: 186 BIC: -892.6
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -59.8725 24.643 -2.430 0.016 -108.489 -11.256
total_cpg_no_filter -1.809e-08 7.21e-09 -2.508 0.013 -3.23e-08 -3.86e-09
bsRate_mean 59.9897 24.743 2.425 0.016 11.177 108.802
avgReadCpG_mean 0.0187 0.027 0.702 0.484 -0.034 0.071
Omnibus: 8.199 Durbin-Watson: 1.497
Prob(Omnibus): 0.017 Jarque-Bera (JB): 6.238
Skew: 0.333 Prob(JB): 0.0442
Kurtosis: 2.413 Cond. No. 1.63e+10

In [83]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs5a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")


methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[83]:
<matplotlib.text.Text at 0x111b740f0>

In [84]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs5a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell23_44")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[84]:
<matplotlib.text.Text at 0x111cfc7f0>

In [85]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs5a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell23_44
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[85]:
<matplotlib.text.Text at 0x11202fe10>

In [ ]:


In [86]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell45_66']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.methylation_unweighted, CD19cell.methylation_unweighted)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5b = pd.merge(out, methylation_differences, how='inner')
print(pairs5b.shape)
pairs5b = pairs5b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs5b.methylation_difference # dependent variable to predict

X = pairs5b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell45_66', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


22
(231, 14)
(231,)
(231, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell45_66', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[86]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.115
Model: OLS Adj. R-squared: 0.103
Method: Least Squares F-statistic: 9.808
Date: Tue, 09 Aug 2016 Prob (F-statistic): 4.15e-06
Time: 18:37:05 Log-Likelihood: 563.60
No. Observations: 231 AIC: -1119.
Df Residuals: 227 BIC: -1105.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 74.2948 55.671 1.335 0.183 -35.404 183.993
total_cpg_no_filter -6.086e-09 8.37e-09 -0.727 0.468 -2.26e-08 1.04e-08
bsRate_mean -73.7124 55.769 -1.322 0.188 -183.604 36.179
avgReadCpG_mean -0.1263 0.032 -3.955 0.000 -0.189 -0.063
Omnibus: 6.820 Durbin-Watson: 2.009
Prob(Omnibus): 0.033 Jarque-Bera (JB): 6.172
Skew: 0.334 Prob(JB): 0.0457
Kurtosis: 2.559 Cond. No. 2.98e+10

In [87]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs5b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")


methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[87]:
<matplotlib.text.Text at 0x11239eb00>

In [88]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs5b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell45_66")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[88]:
<matplotlib.text.Text at 0x11251ba90>

In [89]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs5b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'RRBS_NormalBCD19pcell45_66
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[89]:
<matplotlib.text.Text at 0x112905860>

In [ ]:


In [90]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell67_88']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.methylation_unweighted, CD19cell.methylation_unweighted)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5c = pd.merge(out, methylation_differences, how='inner')
print(pairs5c.shape)
pairs5c = pairs5c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs5c.methylation_difference # dependent variable to predict

X = pairs5c.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell67_88', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'RRBS_NormalBCD19pcell67_88', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[90]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.028
Model: OLS Adj. R-squared: 0.012
Method: Least Squares F-statistic: 1.757
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.157
Time: 18:37:08 Log-Likelihood: 417.12
No. Observations: 190 AIC: -826.2
Df Residuals: 186 BIC: -813.2
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -9.9416 55.282 -0.180 0.857 -119.002 99.118
total_cpg_no_filter -2.338e-08 1.06e-08 -2.201 0.029 -4.43e-08 -2.42e-09
bsRate_mean 9.9682 55.417 0.180 0.857 -99.358 119.295
avgReadCpG_mean 0.0075 0.027 0.274 0.784 -0.046 0.061
Omnibus: 11.636 Durbin-Watson: 1.674
Prob(Omnibus): 0.003 Jarque-Bera (JB): 9.758
Skew: 0.469 Prob(JB): 0.00761
Kurtosis: 2.405 Cond. No. 1.62e+10

In [91]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs5c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")


methylation_difference vs. bsRate, jointplot,  Normal 'RRBS_NormalBCD19pcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[91]:
<matplotlib.text.Text at 0x112cb2860>

In [92]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs5c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'NormalBCD19pCD27mcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[92]:
<matplotlib.text.Text at 0x112df2be0>

In [93]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs5c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'NormalBCD19pCD27mcell67_88")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'NormalBCD19pCD27mcell67_88
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[93]:
<matplotlib.text.Text at 0x1131064a8>

In [ ]:


In [94]:
normb = merged[merged["protocol"] == 'normal_B_cell_A1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation_unweighted, normb.methylation_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6 = pd.merge(out, methylation_differences, how='inner')
print(pairs6.shape)
pairs6 = pairs6.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6.methylation_difference # dependent variable to predict

X = pairs6.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_A1_24', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_A1_24', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[94]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.079
Model: OLS Adj. R-squared: 0.062
Method: Least Squares F-statistic: 4.745
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.00334
Time: 18:37:12 Log-Likelihood: 409.11
No. Observations: 171 AIC: -810.2
Df Residuals: 167 BIC: -797.6
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 6.1254 6.155 0.995 0.321 -6.027 18.278
total_cpg_no_filter 1.388e-07 6.64e-08 2.090 0.038 7.71e-09 2.7e-07
bsRate_mean -5.6323 6.414 -0.878 0.381 -18.295 7.030
avgReadCpG_mean -0.1044 0.038 -2.735 0.007 -0.180 -0.029
Omnibus: 8.741 Durbin-Watson: 1.701
Prob(Omnibus): 0.013 Jarque-Bera (JB): 9.121
Skew: 0.565 Prob(JB): 0.0105
Kurtosis: 2.968 Cond. No. 9.65e+08

In [95]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24")


methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_A1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[95]:
<matplotlib.text.Text at 0x113374668>

In [96]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_A1_24")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_A1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[96]:
<matplotlib.text.Text at 0x11351b1d0>

In [97]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_A1_24")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_A1_24")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_A1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[97]:
<matplotlib.text.Text at 0x1138195f8>

In [ ]:


In [98]:
normb = merged[merged["protocol"] == 'normal_B_cell_B1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation_unweighted, normb.methylation_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6a = pd.merge(out, methylation_differences, how='inner')
print(pairs6a.shape)
pairs6a = pairs6a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6a.methylation_difference # dependent variable to predict

X = pairs6a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_B1_24', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


20
(190, 14)
(190,)
(190, 3)
Regression results for Normal 'normal_B_cell_B1_24', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[98]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.014
Model: OLS Adj. R-squared: -0.002
Method: Least Squares F-statistic: 0.8640
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.461
Time: 18:37:16 Log-Likelihood: 379.30
No. Observations: 190 AIC: -750.6
Df Residuals: 186 BIC: -737.6
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -15.2251 19.973 -0.762 0.447 -54.627 24.177
total_cpg_no_filter -1.934e-08 2.74e-08 -0.706 0.481 -7.34e-08 3.47e-08
bsRate_mean 16.4483 20.978 0.784 0.434 -24.937 57.834
avgReadCpG_mean -0.0751 0.047 -1.593 0.113 -0.168 0.018
Omnibus: 15.484 Durbin-Watson: 1.625
Prob(Omnibus): 0.000 Jarque-Bera (JB): 11.171
Skew: 0.477 Prob(JB): 0.00375
Kurtosis: 2.293 Cond. No. 4.47e+09

In [99]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24")


methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_B1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[99]:
<matplotlib.text.Text at 0x113c4c780>

In [100]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_B1_24")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_B1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[100]:
<matplotlib.text.Text at 0x113f1dc50>

In [101]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_B1_24")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_B1_24")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_B1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[101]:
<matplotlib.text.Text at 0x11414c780>

In [ ]:


In [102]:
normb = merged[merged["protocol"] == 'normal_B_cell_C1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation_unweighted, normb.methylation_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6b = pd.merge(out, methylation_differences, how='inner')
print(pairs6b.shape)
pairs6b = pairs6b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6b.methylation_difference # dependent variable to predict

X = pairs6b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_C1_24', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_C1_24', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[102]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.016
Model: OLS Adj. R-squared: -0.002
Method: Least Squares F-statistic: 0.9091
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.438
Time: 18:37:19 Log-Likelihood: 347.21
No. Observations: 171 AIC: -686.4
Df Residuals: 167 BIC: -673.9
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 3.3068 19.272 0.172 0.864 -34.742 41.355
total_cpg_no_filter -3.097e-08 2.17e-08 -1.430 0.155 -7.37e-08 1.18e-08
bsRate_mean -3.4525 20.146 -0.171 0.864 -43.227 36.322
avgReadCpG_mean 0.0115 0.064 0.180 0.857 -0.114 0.137
Omnibus: 16.680 Durbin-Watson: 1.074
Prob(Omnibus): 0.000 Jarque-Bera (JB): 10.926
Skew: 0.484 Prob(JB): 0.00424
Kurtosis: 2.227 Cond. No. 7.17e+09

In [103]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24")


methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_C1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[103]:
<matplotlib.text.Text at 0x1143a2eb8>

In [104]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_C1_24")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_C1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[104]:
<matplotlib.text.Text at 0x1146b0c18>

In [105]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_C1_24")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_C1_24")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_C1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[105]:
<matplotlib.text.Text at 0x1149bf0b8>

In [ ]:


In [106]:
normb = merged[merged["protocol"] == 'normal_B_cell_D1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation_unweighted, normb.methylation_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6c = pd.merge(out, methylation_differences, how='inner')
print(pairs6c.shape)
pairs6c = pairs6c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6c.methylation_difference # dependent variable to predict

X = pairs6c.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_D1_24', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_D1_24', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[106]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.042
Model: OLS Adj. R-squared: 0.025
Method: Least Squares F-statistic: 2.434
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.0667
Time: 18:37:22 Log-Likelihood: 386.53
No. Observations: 171 AIC: -765.1
Df Residuals: 167 BIC: -752.5
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 23.3470 11.540 2.023 0.045 0.564 46.130
total_cpg_no_filter 1.458e-09 1.49e-08 0.098 0.922 -2.79e-08 3.09e-08
bsRate_mean -23.9715 12.108 -1.980 0.049 -47.876 -0.067
avgReadCpG_mean -0.0365 0.045 -0.801 0.424 -0.126 0.053
Omnibus: 9.038 Durbin-Watson: 2.014
Prob(Omnibus): 0.011 Jarque-Bera (JB): 8.289
Skew: 0.473 Prob(JB): 0.0158
Kurtosis: 2.483 Cond. No. 5.35e+09

In [107]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24")


methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_D1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[107]:
<matplotlib.text.Text at 0x114bf40b8>

In [108]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_D1_24")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_D1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[108]:
<matplotlib.text.Text at 0x114e5e668>

In [109]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_D1_24")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_D1_24")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_D1_24
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[109]:
<matplotlib.text.Text at 0x115227048>

In [ ]:


In [110]:
normb = merged[merged["protocol"] == 'normal_B_cell_G1_22']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation_unweighted, normb.methylation_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6d = pd.merge(out, methylation_differences, how='inner')
print(pairs6d.shape)
pairs6d = pairs6d.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6d.methylation_difference # dependent variable to predict

X = pairs6d.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_G1_22', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


19
(171, 14)
(171,)
(171, 3)
Regression results for Normal 'normal_B_cell_G1_22', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[110]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.080
Model: OLS Adj. R-squared: 0.063
Method: Least Squares F-statistic: 4.840
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.00296
Time: 18:37:26 Log-Likelihood: 344.41
No. Observations: 171 AIC: -680.8
Df Residuals: 167 BIC: -668.2
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const -2.8663 13.935 -0.206 0.837 -30.377 24.645
total_cpg_no_filter 2.797e-08 2.88e-08 0.971 0.333 -2.89e-08 8.49e-08
bsRate_mean 1.3256 14.658 0.090 0.928 -27.613 30.264
avgReadCpG_mean 0.2310 0.065 3.533 0.001 0.102 0.360
Omnibus: 7.794 Durbin-Watson: 1.717
Prob(Omnibus): 0.020 Jarque-Bera (JB): 8.185
Skew: 0.527 Prob(JB): 0.0167
Kurtosis: 2.802 Cond. No. 4.19e+09

In [111]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6d, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22")


methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_G1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[111]:
<matplotlib.text.Text at 0x1155654a8>

In [112]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6d, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_G1_22")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_G1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[112]:
<matplotlib.text.Text at 0x11579bdd8>

In [113]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_G1_22")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6d, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_G1_22")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_G1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[113]:
<matplotlib.text.Text at 0x1136038d0>

In [ ]:


In [ ]:


In [ ]:


In [114]:
normb = merged[merged["protocol"] == 'normal_B_cell_H1_22']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation_unweighted, normb.methylation_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6e = pd.merge(out, methylation_differences, how='inner')
print(pairs6e.shape)
pairs6e = pairs6e.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})

y = pairs6e.methylation_difference # dependent variable to predict

X = pairs6e.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_H1_22', predict \delta methylation_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()


15
(105, 14)
(105,)
(105, 3)
Regression results for Normal 'normal_B_cell_H1_22', predict \delta methylation_unweighted

Variates used are Index(['const', 'total_cpg_no_filter', 'bsRate_mean', 'avgReadCpG_mean'], dtype='object')

Out[114]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.091
Model: OLS Adj. R-squared: 0.064
Method: Least Squares F-statistic: 3.371
Date: Tue, 09 Aug 2016 Prob (F-statistic): 0.0214
Time: 18:37:30 Log-Likelihood: 215.78
No. Observations: 105 AIC: -423.6
Df Residuals: 101 BIC: -412.9
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 32.8471 28.045 1.171 0.244 -22.786 88.480
total_cpg_no_filter 1.254e-09 4.09e-08 0.031 0.976 -7.99e-08 8.24e-08
bsRate_mean -31.5403 28.913 -1.091 0.278 -88.895 25.814
avgReadCpG_mean -0.3575 0.115 -3.100 0.003 -0.586 -0.129
Omnibus: 3.268 Durbin-Watson: 1.641
Prob(Omnibus): 0.195 Jarque-Bera (JB): 3.254
Skew: 0.418 Prob(JB): 0.197
Kurtosis: 2.787 Cond. No. 6.03e+09

In [115]:
print("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=pairs6e, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22")


methylation_difference vs. bsRate, jointplot,  Normal 'normal_B_cell_H1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[115]:
<matplotlib.text.Text at 0x115c61940>

In [116]:
print("methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=pairs6e, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_H1_22")


methylation_difference vs. avgReadCpGs, jointplot,  Normal 'normal_B_cell_H1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[116]:
<matplotlib.text.Text at 0x115ed7898>

In [117]:
print("methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_H1_22")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=pairs6e, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot,  Normal 'normal_B_cell_H1_22")


methylation_difference vs. total unique CpGs per cell, jointplot,  Normal 'normal_B_cell_H1_22
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[117]:
<matplotlib.text.Text at 0x116117588>

In [118]:
print(pairs1.shape)
print(pairs1a.shape)
print(pairs2.shape)
print(pairs2a.shape)
print(pairs2b.shape)
print(pairs3.shape)
print(pairs3a.shape)
print(pairs3b.shape)
print(pairs3c.shape)
print(pairs4.shape)
print(pairs4a.shape)
print(pairs4b.shape)
print(pairs4c.shape)
print(pairs5.shape)
print(pairs5a.shape)
print(pairs5b.shape)
print(pairs5c.shape)
print(pairs6.shape)
print(pairs6a.shape)
print(pairs6b.shape)
print(pairs6c.shape)
print(pairs6d.shape)
print(pairs6e.shape)


(210, 14)
(210, 14)
(190, 14)
(190, 14)
(190, 14)
(136, 14)
(231, 14)
(36, 14)
(190, 14)
(153, 14)
(153, 14)
(136, 14)
(210, 14)
(210, 14)
(190, 14)
(231, 14)
(190, 14)
(171, 14)
(190, 14)
(171, 14)
(171, 14)
(171, 14)
(105, 14)

In [119]:
pairs1['type'] = str('CLL')
pairs1a['type'] = str('CLL')
pairs2['type'] = str('CLL')
pairs2a['type'] = str('CLL')
pairs2b['type'] = str('CLL')
pairs3['type'] = str('normal')
pairs3a['type'] = str('normal')
pairs3b['type'] = str('normal')
pairs3c['type'] = str('normal')
pairs4['type'] = str('normal')
pairs4a['type'] = str('normal')
pairs4b['type'] = str('normal')
pairs4c['type'] = str('normal')
pairs5['type'] = str('normal')
pairs5a['type'] = str('normal')
pairs5b['type'] = str('normal')
pairs5c['type'] = str('normal')
pairs6['type'] = str('normal')
pairs6a['type'] = str('normal')
pairs6b['type'] = str('normal')
pairs6c['type'] = str('normal')
pairs6d['type'] = str('normal')
pairs6e['type'] = str('normal')


frames = [pairs1, pairs1a, pairs2, pairs2a, pairs2b, pairs3, pairs3a, pairs3b, pairs3c, 
          pairs4, pairs4a, pairs4b, pairs4c, pairs5, pairs5a, pairs5b, pairs5c, pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e]

In [120]:
total_pairs = pd.concat(frames)

In [121]:
total_pairs.shape


Out[121]:
(4035, 15)

In [122]:
y = total_pairs.methylation_difference # dependent variable
X = total_pairs.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)

In [123]:
categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)
    
X = X.drop(['type_normal'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta _unweightedmethylation")
est.summary()


Regression results, all batches 'Normal B' vs 'CLL' , predict \delta _unweightedmethylation
Out[123]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.162
Model: OLS Adj. R-squared: 0.161
Method: Least Squares F-statistic: 194.6
Date: Tue, 09 Aug 2016 Prob (F-statistic): 9.20e-153
Time: 18:37:33 Log-Likelihood: 9340.4
No. Observations: 4035 AIC: -1.867e+04
Df Residuals: 4030 BIC: -1.864e+04
Df Model: 4
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 0.5248 0.029 18.140 0.000 0.468 0.582
total_cpg_no_filter -1.36e-08 1.98e-09 -6.863 0.000 -1.75e-08 -9.72e-09
bsRate_mean -0.4601 0.027 -16.997 0.000 -0.513 -0.407
avgReadCpG_mean -0.0047 0.001 -7.117 0.000 -0.006 -0.003
type_CLL -0.0220 0.001 -21.264 0.000 -0.024 -0.020
Omnibus: 475.164 Durbin-Watson: 1.566
Prob(Omnibus): 0.000 Jarque-Bera (JB): 679.256
Skew: 0.896 Prob(JB): 3.17e-148
Kurtosis: 3.911 Cond. No. 5.10e+07

In [124]:
X.head()


Out[124]:
const total_cpg_no_filter bsRate_mean avgReadCpG_mean type_CLL
0 1 709541.5 0.980181 7.029760 1.0
1 1 786284.0 0.980293 7.017405 1.0
2 1 683623.5 0.980268 7.039100 1.0
3 1 853200.5 0.980262 7.028117 1.0
4 1 842686.5 0.980228 7.025451 1.0

In [125]:
print("methylation_difference vs. bsRate, jointplot, both CLL and normal ")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, both CLL and normal ")


methylation_difference vs. bsRate, jointplot, both CLL and normal 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[125]:
<matplotlib.text.Text at 0x1163a16a0>

In [126]:
print("methylation_difference vs. mean avgReadCpG per cell, jointplot, both CLL and normal ")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. avgReadCpG_mean, jointplot, both CLL and normal ")


methylation_difference vs. mean avgReadCpG per cell, jointplot, both CLL and normal 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[126]:
<matplotlib.text.Text at 0x116703358>

In [127]:
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. total # Unique_CpGs per cell, jointplot, both CLL and normal ")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[127]:
<matplotlib.text.Text at 0x116a2a6a0>

In [128]:
print("methylation_difference vs. bsRate, by type! ")
sns.lmplot(x="bsRate_mean", y="methylation_difference",  data=total_pairs, hue='type')
plt.title("methylation_difference vs. bsRate, by type, CLL vs normal")


methylation_difference vs. bsRate, by type! 
Out[128]:
<matplotlib.text.Text at 0x116d40c18>

In [129]:
print("methylation_difference vs. avgReadCpG per cell, by type! ")
sns.lmplot(x="avgReadCpG_mean", y="methylation_difference",  data=total_pairs, hue='type')
plt.title("methylation_difference vs. avgReadCpG per cell, by type, CLL vs Normal ")


methylation_difference vs. avgReadCpG per cell, by type! 
Out[129]:
<matplotlib.text.Text at 0x116e79860>

In [130]:
print("methylation_difference vs. total unique CpG per cel, by type!")
sns.lmplot(x="total_cpg_no_filter", y="methylation_difference",  data=total_pairs, hue='type')
plt.title("methylation_difference vs. total unique CpG per cell, by type, CLL vs Normal ")


methylation_difference vs. total unique CpG per cel, by type!
Out[130]:
<matplotlib.text.Text at 0x11704ed30>

In [131]:
#
# Let's see feature ranking
#
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting methylation_difference: cell type, avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:11: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
Random Forest model score is 0.895796010879

In [132]:
#
# Now do analysis by cell type, CLL vs normal, entire
#
cll_frames = [pairs1, pairs1a, pairs2, pairs2a, pairs2b]

normal_frames = [pairs3, pairs3a, pairs3b, pairs3c, pairs4, pairs4a, pairs4b, pairs4c, 
                 pairs5, pairs5a, pairs5b, pairs5c, pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e]

cll_pairs = pd.concat(cll_frames)
print(cll_pairs.shape)

normal_pairs = pd.concat(normal_frames)
print(normal_pairs.shape)


(990, 15)
(3045, 15)

In [133]:
#
# CLL first
#
y = cll_pairs.methylation_difference # dependent variable
X = cll_pairs.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG", "type"], axis=1)

X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation_unweighted")
print(X.shape)
est.summary()


Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation_unweighted
(990, 4)
Out[133]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.339
Model: OLS Adj. R-squared: 0.337
Method: Least Squares F-statistic: 168.7
Date: Tue, 09 Aug 2016 Prob (F-statistic): 3.00e-88
Time: 18:40:05 Log-Likelihood: 2891.0
No. Observations: 990 AIC: -5774.
Df Residuals: 986 BIC: -5754.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 0.0179 0.088 0.204 0.839 -0.154 0.190
total_cpg_no_filter -3.489e-08 3.53e-09 -9.897 0.000 -4.18e-08 -2.8e-08
bsRate_mean -0.3324 0.080 -4.164 0.000 -0.489 -0.176
avgReadCpG_mean 0.0481 0.006 8.480 0.000 0.037 0.059
Omnibus: 132.065 Durbin-Watson: 1.579
Prob(Omnibus): 0.000 Jarque-Bera (JB): 235.039
Skew: 0.841 Prob(JB): 9.16e-52
Kurtosis: 4.694 Cond. No. 1.66e+08

In [134]:
print("methylation_difference vs. bsRate, jointplot, CLL pairs ")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot,  CLL pairs ")


methylation_difference vs. bsRate, jointplot, CLL pairs 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[134]:
<matplotlib.text.Text at 0x1168d54e0>

In [135]:
print("methylation_difference vs. mean avgReadCpG per cell, jointplot,  CLL pairs ")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. avgReadCpG_mean, jointplot, CLL pairs ")


methylation_difference vs. mean avgReadCpG per cell, jointplot,  CLL pairs 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[135]:
<matplotlib.text.Text at 0x2294afe10>

In [136]:
print("methylation diference vs. total # unique CpGs per cell, CLL pairs")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. total # Unique_CpGs per cell,  CLL pairs")


methylation diference vs. total # unique CpGs per cell, CLL pairs
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[136]:
<matplotlib.text.Text at 0x2299db128>

In [137]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting methylation_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
Random Forest model score is 0.923875725211

In [ ]:


In [138]:
#
# normal
#
y = normal_pairs.methylation_difference # dependent variable
X = normal_pairs.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
               "total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG", "type"], axis=1)

X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation_unweighted")
print(X.shape)
est.summary()


Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation_unweighted
(3045, 4)
Out[138]:
OLS Regression Results
Dep. Variable: methylation_difference R-squared: 0.054
Model: OLS Adj. R-squared: 0.053
Method: Least Squares F-statistic: 57.37
Date: Tue, 09 Aug 2016 Prob (F-statistic): 4.51e-36
Time: 18:40:45 Log-Likelihood: 6756.0
No. Observations: 3045 AIC: -1.350e+04
Df Residuals: 3041 BIC: -1.348e+04
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 0.4804 0.035 13.882 0.000 0.413 0.548
total_cpg_no_filter -7.944e-09 2.56e-09 -3.106 0.002 -1.3e-08 -2.93e-09
bsRate_mean -0.4206 0.032 -13.091 0.000 -0.484 -0.358
avgReadCpG_mean -0.0042 0.001 -5.583 0.000 -0.006 -0.003
Omnibus: 287.266 Durbin-Watson: 1.589
Prob(Omnibus): 0.000 Jarque-Bera (JB): 371.279
Skew: 0.835 Prob(JB): 2.39e-81
Kurtosis: 3.372 Cond. No. 4.39e+07

In [139]:
print("methylation_difference vs. bsRate, jointplot, Normal pairs ")
sns.jointplot(x="bsRate_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal pairs ")


methylation_difference vs. bsRate, jointplot, Normal pairs 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[139]:
<matplotlib.text.Text at 0x117759f60>

In [140]:
print("methylation_difference vs. mean avgReadCpG per cell, jointplot, Normal pairs ")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. avgReadCpG_mean, jointplot, Normal pairs ")


methylation_difference vs. mean avgReadCpG per cell, jointplot, Normal pairs 
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[140]:
<matplotlib.text.Text at 0x117b7c9b0>

In [141]:
print("methylation diference vs. total # unique CpGs per cell, Normal pairs")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference",  data=total_pairs, kind="reg")
plt.title("methylation_difference vs. total # Unique_CpGs per cell, Normal pairs")


methylation diference vs. total # unique CpGs per cell, Normal pairs
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[141]:
<matplotlib.text.Text at 0x1180a1d68>

In [142]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability, 
model.fit(X, y)                                                                    # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting methylation_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:8: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
Random Forest model score is 0.880060844756

In [ ]:


In [ ]:


In [ ]: