In [1]:
%matplotlib inline
In [2]:
#
# Covariates are:
# - Number of unique CpGs per cell
# - Median Average Read CpG per cell (or mean if normally distributed)
# - BS rate per cell
# - CLL or Normal status per cell
#
# For distances (i.e. PDR difference between pairs, methylation difference between pairs),
# the covariates are the mean between the two pairs.
#
In [3]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # print all rows
import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')
import statsmodels.api as sm
In [4]:
stats = pd.read_csv("all_RRBS_statistics_final.csv")
In [5]:
stats.shape
Out[5]:
In [6]:
normal = stats[stats["type"]=="normal"]
CLL = stats[stats["type"]=="CLL"]
In [7]:
len(normal)
Out[7]:
In [8]:
len(CLL)
Out[8]:
In [9]:
mcell_cpg = pd.read_csv("mcell_avgCpGs.csv")
pcell_cpg = pd.read_csv("pcell_avgCpGs.csv")
CD19cell_cpg = pd.read_csv("CD19_avgCpGs.csv")
normalB_cell_cpg = pd.read_csv("normalB_avgCpGs.csv")
trito_cell_cpg = pd.read_csv("trito_avgCpGs.csv")
cw154_cell_cpg = pd.read_csv("cw154_cpgs.csv")
In [10]:
mcell_cpg = mcell_cpg.drop(["Unnamed: 0"], axis=1)
pcell_cpg = pcell_cpg.drop(["Unnamed: 0"], axis=1)
CD19cell_cpg = CD19cell_cpg.drop(["Unnamed: 0"], axis=1)
normalB_cell_cpg = normalB_cell_cpg.drop(["Unnamed: 0"], axis=1)
trito_cell_cpg = trito_cell_cpg.drop(["Unnamed: 0"], axis=1)
cw154_cell_cpg = cw154_cell_cpg.drop(["Unnamed: 0"], axis=1)
In [11]:
cpg_total = pd.concat([mcell_cpg, pcell_cpg, CD19cell_cpg, normalB_cell_cpg, trito_cell_cpg, cw154_cell_cpg])
In [12]:
cpg_total.shape
Out[12]:
In [13]:
merged = stats.merge(cpg_total, on="filename")
In [14]:
merged.shape
Out[14]:
In [15]:
merged.head()
Out[15]:
In [16]:
sns.lmplot(x="bsRate", y="methylation", data=merged)
plt.title("methylation (weighted) vs bisulfite conversion rate")
Out[16]:
In [17]:
sns.lmplot(x="bsRate", y="methylation_unweighted", data=merged)
plt.title("methylation (unweighted) vs bisulfite conversion rate")
Out[17]:
In [18]:
sns.lmplot(x="bsRate", y="PDR_total", data=merged)
plt.title("PDR (weighted) vs bisulfite conversion rate")
Out[18]:
In [19]:
sns.lmplot(x="bsRate", y="PDR_unweighted", data=merged)
plt.title("PDR (unweighted) vs bisulfite conversion rate")
Out[19]:
In [20]:
sns.lmplot(x="bsRate", y="total_cpg_no_filter", data=merged)
plt.title("Total # unique CpGs per cell (no filter) vs bisulfite conversion rate")
Out[20]:
In [21]:
sns.lmplot(x="bsRate", y="total_cpg_gtrthan1", data=merged)
plt.title("Total # unique CpGs per cell (filter: > 1) vs bisulfite conversion rate")
Out[21]:
In [22]:
sns.lmplot(x="bsRate", y="total_cpg_gtrthan38", data=merged)
plt.title("Total # unique CpGs per cell (filter: >= 3.8) vs bisulfite conversion rate")
Out[22]:
In [23]:
sns.lmplot(x="bsRate", y="avgReadCpgs_nofilter", data=merged)
plt.title("Mean avgCpG read per cell (no filter) vs bisulfite conversion rate")
Out[23]:
In [24]:
sns.lmplot(x="bsRate", y="avgReadCpgs_lessthan1CpG", data=merged)
plt.title("Mean avgCpG read per cell (filter: > 1) vs bisulfite conversion rate")
Out[24]:
In [25]:
sns.lmplot(x="bsRate", y="avgReadCpgs_gtreql3.8CpG", data=merged)
plt.title("Mean avgCpG read per cell (filter: >= 3.8) vs bisulfite conversion rate")
Out[25]:
In [ ]:
In [ ]:
In [26]:
tritopool = merged[merged["protocol"] == 'trito_pool_1'] # select only "trito_pool_1" files
print(len(tritopool))
tritopool = tritopool.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
tritopoolA = tritopool.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool.filename,2)) # combines into all pairs
out = pd.DataFrame([tritopoolA.loc[c,:].mean() for c in cc], index=cc) # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.methylation, tritopool.methylation)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1 = pd.merge(out, methylation_differences, how='inner')
print(pairs1.shape)
pairs1 = pd.merge(out, methylation_differences, how='inner')
print(pairs1.shape)
pairs1 = pairs1.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs1.methylation_difference # dependent variable to predict
X = pairs1.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[26]:
In [27]:
print("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs1, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")
Out[27]:
In [28]:
print("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs1, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")
Out[28]:
In [29]:
print("methylation_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs1, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")
Out[29]:
In [30]:
tritopool = merged[merged["protocol"] == 'trito_pool_2'] # select only "trito_pool_1" files
print(len(tritopool))
tritopool = tritopool.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
tritopoolA = tritopool.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool.filename,2)) # combines into all pairs
out = pd.DataFrame([tritopoolA.loc[c,:].mean() for c in cc], index=cc) # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.methylation, tritopool.methylation)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1a = pd.merge(out, methylation_differences, how='inner')
print(pairs1a.shape)
pairs1a = pairs1a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs1a.methylation_difference # dependent variable to predict
X = pairs1a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[30]:
In [31]:
print("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs1a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")
Out[31]:
In [32]:
print("methylation_difference vs. mean avgReadCpGs, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs1a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")
Out[32]:
In [33]:
print("methylation_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs1a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")
Out[33]:
In [ ]:
In [34]:
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.methylation, cw154.methylation)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2 = pd.merge(out, methylation_differences, how='inner')
print(pairs2.shape)
pairs2 = pairs2.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs2.methylation_difference # dependent variable to predict
X = pairs2.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_Tris_protease', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[34]:
In [35]:
print("methylation_difference vs. bsRate, jointplot, CLL cw154_Tris_protease")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs2, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, cw154 CLL cw154_Tris_protease")
Out[35]:
In [36]:
print("methylation_difference vs. avgReadCpGs, jointplot, CLL cw154_Tris_protease")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs2, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_Tris_protease")
Out[36]:
In [37]:
print("methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs2, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_Tris_protease")
Out[37]:
In [38]:
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease_GR']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.methylation, cw154.methylation)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2a = pd.merge(out, methylation_differences, how='inner')
print(pairs2a.shape)
pairs2a = pairs2a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs2a.methylation_difference # dependent variable to predict
X = pairs2a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_Tris_protease_GR', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[38]:
In [39]:
print("methylation_difference vs. bsRate, jointplot, CLL cw154_Tris_protease_GR")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs2a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, cw154 CLL cw154_Tris_protease_GR")
Out[39]:
In [40]:
print("methylation_difference vs. avgReadCpgs, jointplot, CLL cw154_Tris_protease_GR")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs2a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_Tris_protease_GR")
Out[40]:
In [41]:
print("methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease_GR")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs2a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_Tris_protease_GR")
Out[41]:
In [ ]:
In [ ]:
In [42]:
cw154 = merged[merged["protocol"] == 'cw154_CutSmart_proteinase_K']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.methylation, cw154.methylation)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2b = pd.merge(out, methylation_differences, how='inner')
print(pairs2b.shape)
pairs2b = pairs2b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs2b.methylation_difference # dependent variable to predict
X = pairs2b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[42]:
In [43]:
print("methylation_difference vs. bsRate, jointplot, CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs2b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")
Out[43]:
In [44]:
print("methylation_difference vs. avgReadCpG_mean, jointplot, CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs2b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")
Out[44]:
In [45]:
print("methylation_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs2b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")
Out[45]:
In [ ]:
In [46]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell1_22_']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation, pcell.methylation)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3 = pd.merge(out, methylation_differences, how='inner')
print(pairs3.shape)
pairs3 = pairs3.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs3.methylation_difference # dependent variable to predict
X = pairs3.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[46]:
In [47]:
print("methylation_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs3, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27pcell1_22_")
Out[47]:
In [48]:
print("methylation_difference vs. avgReadCpG_mean, jointplot, Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs3, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell1_22_")
Out[48]:
In [49]:
print("methylation_difference vs. total unique CpGs, jointplot, Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs3, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell1_22_")
Out[49]:
In [ ]:
In [50]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell23_44']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation, pcell.methylation)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3a = pd.merge(out, methylation_differences, how='inner')
print(pairs3a.shape)
pairs3a = pairs3a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs3a.methylation_difference # dependent variable to predict
X = pairs3a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[50]:
In [51]:
print("methylation_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs3a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27pcell23_44")
Out[51]:
In [52]:
print("methylation_difference vs. avgReadCpG_mean, jointplot, Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs3a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell23_44")
Out[52]:
In [53]:
print("methylation_difference vs. total unique CpGs, jointplot, Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs3a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell23_44")
Out[53]:
In [ ]:
In [54]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell45_66']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation, pcell.methylation)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3b = pd.merge(out, methylation_differences, how='inner')
print(pairs3b.shape)
pairs3b = pairs3b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs3b.methylation_difference # dependent variable to predict
X = pairs3b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[54]:
In [55]:
print("methylation_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs3b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27pcell45_66")
Out[55]:
In [56]:
print("methylation_difference vs. avgReadCpG_mean, jointplot, Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs3b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell45_66")
Out[56]:
In [57]:
print("methylation_difference vs. total unique CpGs, jointplot, Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs3b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell45_66")
Out[57]:
In [ ]:
In [ ]:
In [58]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell67_88']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation, pcell.methylation)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3c = pd.merge(out, methylation_differences, how='inner')
print(pairs3c.shape)
pairs3c = pairs3c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs3c.methylation_difference # dependent variable to predict
X = pairs3c.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[58]:
In [59]:
print("methylation_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs3c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27pcell67_88")
Out[59]:
In [60]:
print("methylation_difference vs. avgReadCpGs, jointplot, Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs3c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell67_88")
Out[60]:
In [61]:
print("methylation_difference vs. total unique CpGs per cell, jointplot, Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs3c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell67_88")
Out[61]:
In [ ]:
In [ ]:
In [62]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell1_22_']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation, mcell.methylation)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4 = pd.merge(out, methylation_differences, how='inner')
print(pairs4.shape)
pairs4 = pairs4.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs4.methylation_difference # dependent variable to predict
X = pairs4.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[62]:
In [63]:
print("methylation_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs4, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27mcell1_22_")
Out[63]:
In [64]:
print("methylation_difference vs. avgReadCpG_mean, jointplot, Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs4, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell1_22_")
Out[64]:
In [65]:
print("methylation_difference vs. total unique CpGs, jointplot, Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs4, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell1_22_")
Out[65]:
In [ ]:
In [66]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell23_44']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation, mcell.methylation)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4a = pd.merge(out, methylation_differences, how='inner')
print(pairs4a.shape)
pairs4a = pairs4a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs4a.methylation_difference # dependent variable to predict
X = pairs4a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell23_44', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[66]:
In [67]:
print("methylation_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs4a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27mcell23_44")
Out[67]:
In [68]:
print("methylation_difference vs. avgReadCpG_mean, jointplot, Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs4a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell23_44")
Out[68]:
In [69]:
print("methylation_difference vs. total unique CpGs per cell, jointplot, Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs4a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell23_44")
Out[69]:
In [ ]:
In [70]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell45_66']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation, mcell.methylation)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4b = pd.merge(out, methylation_differences, how='inner')
print(pairs4b.shape)
pairs4b = pairs4b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs4b.methylation_difference # dependent variable to predict
X = pairs4b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell45_66', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[70]:
In [71]:
print("methylation_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs4b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27mcell45_66")
Out[71]:
In [72]:
print("methylation_difference vs. avgReadCpGs, jointplot, Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs4b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell45_66")
Out[72]:
In [73]:
print("methylation_difference vs. total unique CpGs per cell, jointplot, Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs4b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell45_66")
Out[73]:
In [ ]:
In [74]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell67_88']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation, mcell.methylation)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4c = pd.merge(out, methylation_differences, how='inner')
print(pairs4c.shape)
pairs4c = pairs4c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs4c.methylation_difference # dependent variable to predict
X = pairs4c.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell67_88', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[74]:
In [75]:
print("methylation_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs4c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27mcell67_88")
Out[75]:
In [76]:
print("methylation_difference vs. avgReadCpGs, jointplot, Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs4c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")
Out[76]:
In [77]:
print("methylation_difference vs. total unique CpGs per cell, jointplot, Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs4c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")
Out[77]:
In [ ]:
In [78]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell1_22_']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.methylation, CD19cell.methylation)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5 = pd.merge(out, methylation_differences, how='inner')
print(pairs5.shape)
pairs5 = pairs5.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs5.methylation_difference # dependent variable to predict
X = pairs5.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell1_22_', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[78]:
In [79]:
print("methylation_difference vs. bsRate, jointplot, Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs5, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal 'RRBS_NormalBCD19pcell1_22_")
Out[79]:
In [80]:
print("methylation_difference vs. avgReadCpGs, jointplot, Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs5, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell1_22_")
Out[80]:
In [81]:
print("methylation_difference vs. total unique CpGs per cell, jointplot, Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs5, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell1_22_")
Out[81]:
In [ ]:
In [82]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell23_44']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.methylation, CD19cell.methylation)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5a = pd.merge(out, methylation_differences, how='inner')
print(pairs5a.shape)
pairs5a = pairs5a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs5a.methylation_difference # dependent variable to predict
X = pairs5a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell23_44', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[82]:
In [83]:
print("methylation_difference vs. bsRate, jointplot, Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs5a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal 'RRBS_NormalBCD19pcell23_44")
Out[83]:
In [84]:
print("methylation_difference vs. avgReadCpGs, jointplot, Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs5a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell23_44")
Out[84]:
In [85]:
print("methylation_difference vs. total unique CpGs per cell, jointplot, Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs5a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell23_44")
Out[85]:
In [ ]:
In [86]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell45_66']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.methylation, CD19cell.methylation)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5b = pd.merge(out, methylation_differences, how='inner')
print(pairs5b.shape)
pairs5b = pairs5b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs5b.methylation_difference # dependent variable to predict
X = pairs5b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell45_66', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[86]:
In [87]:
print("methylation_difference vs. bsRate, jointplot, Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs5b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal 'RRBS_NormalBCD19pcell45_66")
Out[87]:
In [88]:
print("methylation_difference vs. avgReadCpGs, jointplot, Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs5b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell45_66")
Out[88]:
In [89]:
print("methylation_difference vs. total unique CpGs per cell, jointplot, Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs5b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell45_66")
Out[89]:
In [ ]:
In [90]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell67_88']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.methylation, CD19cell.methylation)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5c = pd.merge(out, methylation_differences, how='inner')
print(pairs5c.shape)
pairs5c = pairs5c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs5c.methylation_difference # dependent variable to predict
X = pairs5c.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell67_88', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[90]:
In [91]:
print("methylation_difference vs. bsRate, jointplot, Normal 'RRBS_NormalBCD19pcell67_88")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs5c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27mcell67_88")
Out[91]:
In [92]:
print("methylation_difference vs. avgReadCpGs, jointplot, Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs5c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")
Out[92]:
In [93]:
print("methylation_difference vs. total unique CpGs per cell, jointplot, Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs5c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")
Out[93]:
In [ ]:
In [94]:
normb = merged[merged["protocol"] == 'normal_B_cell_A1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6 = pd.merge(out, methylation_differences, how='inner')
print(pairs6.shape)
pairs6 = pairs6.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs6.methylation_difference # dependent variable to predict
X = pairs6.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_A1_24', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[94]:
In [95]:
print("methylation_difference vs. bsRate, jointplot, Normal 'normal_B_cell_A1_24")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs6, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal 'normal_B_cell_A1_24")
Out[95]:
In [96]:
print("methylation_difference vs. avgReadCpGs, jointplot, Normal 'normal_B_cell_A1_24")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs6, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_A1_24")
Out[96]:
In [97]:
print("methylation_difference vs. total unique CpGs per cell, jointplot, Normal 'normal_B_cell_A1_24")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs6, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_A1_24")
Out[97]:
In [ ]:
In [98]:
normb = merged[merged["protocol"] == 'normal_B_cell_B1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6a = pd.merge(out, methylation_differences, how='inner')
print(pairs6a.shape)
pairs6a = pairs6a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs6a.methylation_difference # dependent variable to predict
X = pairs6a.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_B1_24', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[98]:
In [99]:
print("methylation_difference vs. bsRate, jointplot, Normal 'normal_B_cell_B1_24")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs6a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal 'normal_B_cell_B1_24")
Out[99]:
In [100]:
print("methylation_difference vs. avgReadCpGs, jointplot, Normal 'normal_B_cell_B1_24")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs6a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_B1_24")
Out[100]:
In [101]:
print("methylation_difference vs. total unique CpGs per cell, jointplot, Normal 'normal_B_cell_B1_24")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs6a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_B1_24")
Out[101]:
In [ ]:
In [102]:
normb = merged[merged["protocol"] == 'normal_B_cell_C1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6b = pd.merge(out, methylation_differences, how='inner')
print(pairs6b.shape)
pairs6b = pairs6b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs6b.methylation_difference # dependent variable to predict
X = pairs6b.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_C1_24', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[102]:
In [103]:
print("methylation_difference vs. bsRate, jointplot, Normal 'normal_B_cell_C1_24")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs6b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal 'normal_B_cell_C1_24")
Out[103]:
In [104]:
print("methylation_difference vs. avgReadCpGs, jointplot, Normal 'normal_B_cell_C1_24")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs6b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_C1_24")
Out[104]:
In [105]:
print("methylation_difference vs. total unique CpGs per cell, jointplot, Normal 'normal_B_cell_C1_24")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs6b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_C1_24")
Out[105]:
In [ ]:
In [106]:
normb = merged[merged["protocol"] == 'normal_B_cell_D1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6c = pd.merge(out, methylation_differences, how='inner')
print(pairs6c.shape)
pairs6c = pairs6c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs6c.methylation_difference # dependent variable to predict
X = pairs6c.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_D1_24', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[106]:
In [107]:
print("methylation_difference vs. bsRate, jointplot, Normal 'normal_B_cell_D1_24")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs6c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal 'normal_B_cell_D1_24")
Out[107]:
In [108]:
print("methylation_difference vs. avgReadCpGs, jointplot, Normal 'normal_B_cell_D1_24")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs6c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_D1_24")
Out[108]:
In [109]:
print("methylation_difference vs. total unique CpGs per cell, jointplot, Normal 'normal_B_cell_D1_24")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs6c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_D1_24")
Out[109]:
In [ ]:
In [110]:
normb = merged[merged["protocol"] == 'normal_B_cell_G1_22']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6d = pd.merge(out, methylation_differences, how='inner')
print(pairs6d.shape)
pairs6d = pairs6d.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs6d.methylation_difference # dependent variable to predict
X = pairs6d.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_G1_22', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[110]:
In [111]:
print("methylation_difference vs. bsRate, jointplot, Normal 'normal_B_cell_G1_22")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs6d, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal 'normal_B_cell_G1_22")
Out[111]:
In [112]:
print("methylation_difference vs. avgReadCpGs, jointplot, Normal 'normal_B_cell_G1_22")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs6d, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_G1_22")
Out[112]:
In [113]:
print("methylation_difference vs. total unique CpGs per cell, jointplot, Normal 'normal_B_cell_G1_22")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs6d, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_G1_22")
Out[113]:
In [ ]:
In [ ]:
In [ ]:
In [114]:
normb = merged[merged["protocol"] == 'normal_B_cell_H1_22']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6e = pd.merge(out, methylation_differences, how='inner')
print(pairs6e.shape)
pairs6e = pairs6e.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_nofilter":"avgReadCpG_mean"})
y = pairs6e.methylation_difference # dependent variable to predict
X = pairs6e.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_H1_22', predict \delta methylation")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[114]:
In [115]:
print("methylation_difference vs. bsRate, jointplot, Normal 'normal_B_cell_H1_22")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs6e, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal 'normal_B_cell_H1_22")
Out[115]:
In [116]:
print("methylation_difference vs. avgReadCpGs, jointplot, Normal 'normal_B_cell_H1_22")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=pairs6e, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_H1_22")
Out[116]:
In [117]:
print("methylation_difference vs. total unique CpGs per cell, jointplot, Normal 'normal_B_cell_H1_22")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=pairs6e, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_H1_22")
Out[117]:
In [118]:
print(pairs1.shape)
print(pairs1a.shape)
print(pairs2.shape)
print(pairs2a.shape)
print(pairs2b.shape)
print(pairs3.shape)
print(pairs3a.shape)
print(pairs3b.shape)
print(pairs3c.shape)
print(pairs4.shape)
print(pairs4a.shape)
print(pairs4b.shape)
print(pairs4c.shape)
print(pairs5.shape)
print(pairs5a.shape)
print(pairs5b.shape)
print(pairs5c.shape)
print(pairs6.shape)
print(pairs6a.shape)
print(pairs6b.shape)
print(pairs6c.shape)
print(pairs6d.shape)
print(pairs6e.shape)
In [119]:
pairs1['type'] = str('CLL')
pairs1a['type'] = str('CLL')
pairs2['type'] = str('CLL')
pairs2a['type'] = str('CLL')
pairs2b['type'] = str('CLL')
pairs3['type'] = str('normal')
pairs3a['type'] = str('normal')
pairs3b['type'] = str('normal')
pairs3c['type'] = str('normal')
pairs4['type'] = str('normal')
pairs4a['type'] = str('normal')
pairs4b['type'] = str('normal')
pairs4c['type'] = str('normal')
pairs5['type'] = str('normal')
pairs5a['type'] = str('normal')
pairs5b['type'] = str('normal')
pairs5c['type'] = str('normal')
pairs6['type'] = str('normal')
pairs6a['type'] = str('normal')
pairs6b['type'] = str('normal')
pairs6c['type'] = str('normal')
pairs6d['type'] = str('normal')
pairs6e['type'] = str('normal')
frames = [pairs1, pairs1a, pairs2, pairs2a, pairs2b, pairs3, pairs3a, pairs3b, pairs3c,
pairs4, pairs4a, pairs4b, pairs4c, pairs5, pairs5a, pairs5b, pairs5c, pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e]
In [120]:
total_pairs = pd.concat(frames)
In [121]:
total_pairs.shape
Out[121]:
In [122]:
y = total_pairs.methylation_difference # dependent variable
X = total_pairs.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG"], axis=1)
In [123]:
categorical_variables = ['type']
for variable in categorical_variables:
# Fill missing data with the word "Missing"
X[variable].fillna("Missing", inplace=True)
# Create array of dummies
dummies = pd.get_dummies(X[variable], prefix=variable)
# Update X to include dummies and drop the main variable
X = pd.concat([X, dummies], axis=1)
X.drop([variable], axis=1, inplace=True)
X = X.drop(['type_normal'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation")
est.summary()
Out[123]:
In [124]:
X.head()
Out[124]:
In [125]:
print("methylation_difference vs. bsRate, jointplot, both CLL and normal ")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=total_pairs, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, both CLL and normal ")
Out[125]:
In [126]:
print("methylation_difference vs. mean avgReadCpG per cell, jointplot, both CLL and normal ")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=total_pairs, kind="reg")
plt.title("methylation_difference vs. avgReadCpG_mean, jointplot, both CLL and normal ")
Out[126]:
In [127]:
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=total_pairs, kind="reg")
plt.title("methylation_difference vs. total # Unique_CpGs per cell, jointplot, both CLL and normal ")
Out[127]:
In [128]:
print("methylation_difference vs. bsRate, by type! ")
sns.lmplot(x="bsRate_mean", y="methylation_difference", data=total_pairs, hue='type')
plt.title("methylation_difference vs. bsRate, by type, CLL vs normal")
Out[128]:
In [129]:
print("methylation_difference vs. avgReadCpG per cell, by type! ")
sns.lmplot(x="avgReadCpG_mean", y="methylation_difference", data=total_pairs, hue='type')
plt.title("methylation_difference vs. avgReadCpG per cell, by type, CLL vs Normal ")
Out[129]:
In [130]:
print("methylation_difference vs. total unique CpG per cel, by type!")
sns.lmplot(x="total_cpg_no_filter", y="methylation_difference", data=total_pairs, hue='type')
plt.title("methylation_difference vs. total unique CpG per cell, by type, CLL vs Normal ")
Out[130]:
In [131]:
#
# Let's see feature ranking
#
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability,
model.fit(X, y) # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting methylation_difference: cell type, avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))
In [132]:
#
# Now do analysis by cell type, CLL vs normal, entire
#
cll_frames = [pairs1, pairs1a, pairs2, pairs2a, pairs2b]
normal_frames = [pairs3, pairs3a, pairs3b, pairs3c, pairs4, pairs4a, pairs4b, pairs4c,
pairs5, pairs5a, pairs5b, pairs5c, pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e]
cll_pairs = pd.concat(cll_frames)
print(cll_pairs.shape)
normal_pairs = pd.concat(normal_frames)
print(normal_pairs.shape)
In [133]:
#
# CLL first
#
y = cll_pairs.methylation_difference # dependent variable
X = cll_pairs.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG", "type"], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation")
print(X.shape)
est.summary()
Out[133]:
In [134]:
print("methylation_difference vs. bsRate, jointplot, CLL pairs ")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=total_pairs, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, CLL pairs ")
Out[134]:
In [135]:
print("methylation_difference vs. mean avgReadCpG per cell, jointplot, CLL pairs ")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=total_pairs, kind="reg")
plt.title("methylation_difference vs. avgReadCpG_mean, jointplot, CLL pairs ")
Out[135]:
In [136]:
print("methylation diference vs. total # unique CpGs per cell, CLL pairs")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=total_pairs, kind="reg")
plt.title("methylation_difference vs. total # Unique_CpGs per cell, CLL pairs")
Out[136]:
In [137]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability,
model.fit(X, y) # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting methylation_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))
In [ ]:
In [138]:
#
# normal
#
y = normal_pairs.methylation_difference # dependent variable
X = normal_pairs.drop(["methylation_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_lessthan1CpG", "avgReadCpgs_gtreql3.8CpG", "type"], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation")
print(X.shape)
est.summary()
Out[138]:
In [139]:
print("methylation_difference vs. bsRate, jointplot, Normal pairs ")
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=total_pairs, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, Normal pairs ")
Out[139]:
In [140]:
print("methylation_difference vs. mean avgReadCpG per cell, jointplot, Normal pairs ")
sns.jointplot(x="avgReadCpG_mean", y="methylation_difference", data=total_pairs, kind="reg")
plt.title("methylation_difference vs. avgReadCpG_mean, jointplot, Normal pairs ")
Out[140]:
In [141]:
print("methylation diference vs. total # unique CpGs per cell, Normal pairs")
sns.jointplot(x="total_cpg_no_filter", y="methylation_difference", data=total_pairs, kind="reg")
plt.title("methylation_difference vs. total # Unique_CpGs per cell, Normal pairs")
Out[141]:
In [142]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability,
model.fit(X, y) # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting methylation_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))
In [ ]:
In [ ]:
In [ ]: