In [1]:
%matplotlib inline
In [2]:
#
# We need a regression model for predicting methylation (highest priority):
# Variables are:
# - Number of unique CpGs per cell
# - Median Average Read CpG per cell (or mean if normally distributed)
# - BS rate per cell
# - CLL or Normal status per cell
# What is the coefficient and the P value for each variable for a model predicting methylation?
In [3]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import statsmodels.api as sm
pd.set_option('display.max_columns', 50) # print all rows
import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')
In [4]:
normal_cellA_df = pd.read_csv("Meth_PDR_cell_RRBS_normal_B1_ALL.csv")
normal_cellB_df = pd.read_csv("Meth_PDR_cell_normalpcell_ALL.csv")
normal_cellC_df = pd.read_csv("Meth_PDR_cell_normalmcell_ALL.csv")
cll_cellA_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_cw154_ALL.csv")
cll_cellC_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_trito_pool_C_ALL.csv")
In [5]:
normal_cellA_df = normal_cellA_df.drop(["Unnamed: 0"], axis=1)
normal_cellA_df["type"] = str('normal')
normal_cellA_df["bio"] = str('normal_B')
normal_cellA_df["protocol"] = normal_cellA_df["filename"].str[5:24]
normal_cellA_df["filename"] = normal_cellA_df["filename"].str[:40]
In [6]:
normal_cellA_df.head()
Out[6]:
In [7]:
cpg1 = pd.read_csv('Meth_PDR_cell_RRBS_normal_B1_CpGs.csv')
cpg1 = cpg1.drop(["Unnamed: 0"], axis=1)
cpg1["filename"] = cpg1["filename"].str[:40]
normal_cellA_df = pd.merge(normal_cellA_df, cpg1, how='inner')
normal_cellB_df["type"] = str('normal')
normal_cellB_df = normal_cellB_df.drop(["Unnamed: 0"], axis=1)
normal_cellB_df["bio"] = str('CD27p')
normal_cellB_df["protocol"] = normal_cellB_df["filename"].str[5:31]
normal_cellB_df["filename"] = normal_cellB_df["filename"].str[:50]
normal_cellB_df["filename"] = normal_cellB_df["filename"].str.replace(r'.dan$', '')
normal_cellB_df["filename"] = normal_cellB_df["filename"].str.replace(r'.da$', '')
cpg2 = pd.read_csv('NormalBCD19pCD27pcell_CpGs.csv')
cpg2 = cpg2.drop(["Unnamed: 0"], axis=1)
cpg2["filename"] = cpg2["filename"].str[:50]
cpg2["filename"] = cpg2["filename"].str.replace(r'.dan$', '')
cpg2["filename"] = cpg2["filename"].str.replace(r'.da$', '')
normal_cellB_df = pd.merge(normal_cellB_df, cpg2, how='inner')
normal_cellC_df = normal_cellC_df.drop(["Unnamed: 0"], axis=1)
normal_cellC_df["type"] = str('normal')
normal_cellC_df["protocol"] = normal_cellC_df["filename"].str[5:31]
normal_cellC_df["bio"] = str('CD27m')
normal_cellC_df["filename"] = normal_cellC_df["filename"].str[:50]
normal_cellC_df["filename"] = normal_cellC_df["filename"].str.replace(r'.dan$', '')
normal_cellC_df["filename"] = normal_cellC_df["filename"].str.replace(r'.da$', '')
cpg3 = pd.read_csv("NormalBCD19pCD27mcell_CpGs.csv")
cpg3 = cpg3.drop(["Unnamed: 0"], axis=1)
cpg3["filename"] = cpg3["filename"].str[:50]
cpg3["filename"] = cpg3["filename"].str.replace(r'.dan$', '')
cpg3["filename"] = cpg3["filename"].str.replace(r'.da$', '')
normal_cellC_df = pd.merge(normal_cellC_df, cpg3, how='inner')
frames3 = [normal_cellA_df, normal_cellB_df, normal_cellC_df]
normal_result = pd.concat(frames3)
print(normal_result.shape)
print(normal_result.columns)
In [8]:
normal_result = normal_result[['filename', 'methylation', 'total_reads', 'type', 'bio', 'protocol', 'avgReadCpGs_mean', 'avgReadCpGs_median']]
In [9]:
cll_cellA_df = cll_cellA_df.drop(["Unnamed: 0"], axis=1)
cll_cellA_df["type"] = str('CLL')
cll_cellA_df["protocol"] = cll_cellA_df["filename"].str[5:34]
cll_cellA_df["protocol"][cll_cellA_df["protocol"] == 'cw154_CutSmart_proteinase_K_T'] = 'cw154_CutSmart_proteinase_K'
cll_cellA_df["protocol"][cll_cellA_df["protocol"] == 'cw154_Tris_protease_GR_CAGAGA'] = 'cw154_Tris_protease_GR'
cll_cellA_df["protocol"][(cll_cellA_df["protocol"] !=
'cw154_Tris_protease_GR') & (cll_cellA_df["protocol"] != 'cw154_CutSmart_proteinase_K')] = 'cw154_Tris_protease'
cll_cellA_df["bio"] = str('CLL')
cll_cellA_df["filename"] = cll_cellA_df["filename"].str[:51]
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.da$', '')
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.annoRR$', '')
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.ann$', '')
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.dan$', '')
cpg4 = pd.read_csv('CLL_RRBS_cw154_A_CpGs.csv')
cpg4 = cpg4.drop(["Unnamed: 0"], axis=1)
cpg4["filename"] = cpg4["filename"].str[:51]
cpg4["filename"] = cpg4["filename"].str.replace(r'.da$', '')
cpg4["filename"] = cpg4["filename"].str.replace(r'.annoRR$', '')
cpg4["filename"] = cpg4["filename"].str.replace(r'.ann$', '')
cpg4["filename"] = cpg4["filename"].str.replace(r'.dan$', '')
cll_cellA_df = pd.merge(cll_cellA_df, cpg4, how='inner')
cll_cellC_df = cll_cellC_df.drop(["Unnamed: 0"], axis=1)
cll_cellC_df["type"] = str('CLL')
cll_cellC_df["bio"] = str('CLL')
cll_cellC_df["protocol"] = cll_cellC_df["filename"].str[5:17]
cll_cellC_df["filename"] = cll_cellC_df["filename"].str[:33]
cpg5 = pd.read_csv('Meth_PDR_cell_RRBS_trito_pool_CpGs.csv')
cpg5 = cpg5.drop(["Unnamed: 0"], axis=1)
cpg5["filename"] = cpg5["filename"].str[:33]
cll_cellC_df = pd.merge(cll_cellC_df, cpg5, how='inner')
frames2 = [cll_cellA_df, cll_cellC_df]
cll_result = pd.concat(frames2)
print(cll_result.shape)
print(cll_result.columns)
In [10]:
cll_result = cll_result[['filename', 'methylation', 'total_reads', 'type', 'bio', 'protocol', 'avgReadCpGs_mean', 'avgReadCpGs_median']]
cll_result = cll_result.reset_index(drop=True)
normal_result = normal_result.reset_index(drop=True)
combined2 = normal_result.append(cll_result)
combined2 = combined2.reset_index(drop=True)
combined2.head()
Out[10]:
In [11]:
bs = pd.read_table('allStats.txt')
bs = bs.drop('sample.1', axis=1)
bs = bs.drop('sample.2', axis=1)
bs = bs.reset_index(drop=True)
bs = bs.drop('class', axis=1)
bs = bs.drop('totMeth', axis=1)
bs = bs.drop('totSeen', axis=1)
bs = bs.drop('avSum', axis=1)
bs = bs.drop('avTot', axis=1)
bs = bs.drop('rMixed', axis=1)
bs = bs.drop('rTot', axis=1)
bs = bs.drop('rAv', axis=1)
bs = bs.drop('rAvTot', axis=1)
bs = bs.drop('bed', axis=1)
bs = bs.drop('methInfoFile', axis=1)
bs = bs.drop('totReads', axis=1)
bs = bs.drop('totAligned', axis=1)
bs = bs.drop('totClipped', axis=1)
bs = bs.drop('totSeenCpG', axis=1)
bs = bs.drop('totUsed', axis=1)
bs = bs.drop('totMethCpG', axis=1)
# bs = bs.drop('totCpG', axis=1)
bs = bs.drop('totalReadPairs', axis=1)
bs = bs.drop('alignedReads', axis=1)
bs = bs.drop('totalReads', axis=1)
bs = bs.rename(columns = {'sample':'filename'})
In [12]:
merged = pd.merge(combined2, bs, how='inner')
merged = merged.reset_index(drop=True)
merged = merged.rename(columns = {'totCpG':'Unique_CpGs'})
# Remove all data points with less than 100k in totcpg
merged = merged[merged['total_reads'] > 100000]
scattermatrix1 = merged.drop(['filename', 'total_reads', 'type', 'protocol', 'avgReadCpGs_median'], axis=1)
sns.lmplot(x="bsRate", y="methylation", data=scattermatrix1)
plt.title("methylation vs bisulfite conversion rate")
Out[12]:
In [13]:
scattermatrix7 = merged.drop(['filename', 'bio', 'avgReadCpGs_median'], axis=1)
y = scattermatrix7.methylation # dependent variable
print(y.shape)
X = scattermatrix7.drop(['methylation', 'total_reads', 'protocol'], axis=1)
print(X.shape)
categorical_variables = ['type']
for variable in categorical_variables:
# Fill missing data with the word "Missing"
X[variable].fillna("Missing", inplace=True)
# Create array of dummies
dummies = pd.get_dummies(X[variable], prefix=variable)
# Update X to include dummies and drop the main variable
X = pd.concat([X, dummies], axis=1)
X.drop([variable], axis=1, inplace=True)
import statsmodels.api as sm
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict methylation")
print("Variables: Number of unique CpGs per cell, mean Average Read CpG per cell, BS rate, Cll or Normal B")
est.summary()
Out[13]:
In [14]:
tritopool = merged[merged["protocol"] == 'trito_pool_1']
tritopool = tritopool.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
tritopoolA = tritopool.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool.filename,2))
out = pd.DataFrame([tritopoolA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.methylation, tritopool.methylation)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs = pd.merge(out, methylation_differences, how='inner')
print(pairs.shape)
pairs = pairs.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs.methylation_difference # dependent variable
X = pairs.drop(['methylation', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict \delta methylation")
est.summary()
Out[14]:
In [15]:
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")
Out[15]:
In [16]:
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference", data=pairs, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")
Out[16]:
In [17]:
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference", data=pairs, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")
Out[17]:
In [18]:
tritopool2 = merged[merged["protocol"] == 'trito_pool_2']
tritopool2 = tritopool2.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
tritopool2A = tritopool2.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool2.filename,2))
out = pd.DataFrame([tritopool2A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool2.methylation, tritopool2.methylation)), tritopool2.filename, tritopool2.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1a = pd.merge(out, methylation_differences, how='inner')
print(pairs1a.shape)
pairs1a = pairs1a.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs1a.methylation_difference # dependent variable
X = pairs1a.drop(['methylation', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool_2', predict \delta methylation")
est.summary()
Out[18]:
In [19]:
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs1a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")
Out[19]:
In [20]:
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference", data=pairs1a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")
Out[20]:
In [21]:
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference", data=pairs1a, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")
Out[21]:
In [22]:
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease']
cw154 = cw154.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.methylation, cw154.methylation)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2 = pd.merge(out, methylation_differences, how='inner')
pairs2 = pairs2.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs2.methylation_difference # dependent variable
X = pairs2.drop(['methylation', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_Tris_protease', predict \delta methylation")
est.summary()
Out[22]:
In [23]:
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs2, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, cw154_Tris_protease")
Out[23]:
In [24]:
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference", data=pairs2, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154_Tris_protease")
Out[24]:
In [25]:
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference", data=pairs2, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, cw154_Tris_protease")
Out[25]:
In [26]:
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease_GR']
cw154 = cw154.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.methylation, cw154.methylation)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2a = pd.merge(out, methylation_differences, how='inner')
pairs2a = pairs2a.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs2a.methylation_difference # dependent variable
X = pairs2a.drop(['methylation', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_cw154_Tris_protease_GR', predict \delta methylation")
est.summary()
Out[26]:
In [27]:
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs2a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, cw154_Tris_protease_GR")
Out[27]:
In [28]:
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference", data=pairs2a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154_Tris_protease_GR")
Out[28]:
In [29]:
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference", data=pairs2a, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, cw154_Tris_protease_GR")
Out[29]:
In [30]:
# cw154_CutSmart_proteinase_K
cw154 = merged[merged["protocol"] == 'cw154_CutSmart_proteinase_K']
cw154 = cw154.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.methylation, cw154.methylation)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2b = pd.merge(out, methylation_differences, how='inner')
pairs2b = pairs2b.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs2b.methylation_difference # dependent variable
X = pairs2b.drop(['methylation', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta methylation")
est.summary()
Out[30]:
In [31]:
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs2b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, cw154_Tris_protease_GR")
Out[31]:
In [32]:
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference", data=pairs2b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, cw154_Tris_protease_GR")
Out[32]:
In [33]:
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference", data=pairs2b, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, cw154_Tris_protease_GR")
Out[33]:
In [34]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell1_22_']
pcell = pcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation, pcell.methylation)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3 = pd.merge(out, methylation_differences, how='inner')
pairs3 = pairs3.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs3.methylation_difference # dependent variable
X = pairs3.drop(['methylation', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal B 'NormalBCD19pCD27pcell1_22', predict \delta methylation")
est.summary()
Out[34]:
In [35]:
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs3, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, CD27pcell1_22")
Out[35]:
In [36]:
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference", data=pairs3, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, CD27pcell1_22")
Out[36]:
In [37]:
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference", data=pairs3, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, CD27pcell1_22")
Out[37]:
In [38]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell23_44']
pcell = pcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation, pcell.methylation)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3a = pd.merge(out, methylation_differences, how='inner')
pairs3a = pairs3a.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs3a.methylation_difference # dependent variable
X = pairs3a.drop(['methylation', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal B 'NormalBCD19pCD27pcell22_34', predict \delta methylation")
est.summary()
Out[38]:
In [39]:
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs3a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, CD27pcell23_44")
Out[39]:
In [40]:
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference", data=pairs3a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, CD27pcell23_44")
Out[40]:
In [41]:
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference", data=pairs3a, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, CD27pcell23_44")
Out[41]:
In [42]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell45_66']
pcell = pcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation, pcell.methylation)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3b = pd.merge(out, methylation_differences, how='inner')
pairs3b = pairs3b.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs3b.methylation_difference # dependent variable
X = pairs3b.drop(['methylation', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal B 'NormalBCD19pCD27pcell45_66', predict \delta methylation")
est.summary()
Out[42]:
In [43]:
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs3b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, NormalBCD19pCD27pcell45_66")
Out[43]:
In [44]:
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference", data=pairs3b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, NormalBCD19pCD27pcell45_66")
Out[44]:
In [45]:
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference", data=pairs3b, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, NormalBCD19pCD27pcell45_66")
Out[45]:
In [46]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell67_88']
pcell = pcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.methylation, pcell.methylation)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3c = pd.merge(out, methylation_differences, how='inner')
pairs3c = pairs3c.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs3c.methylation_difference # dependent variable
X = pairs3c.drop(['methylation', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal B 'NormalBCD19pCD27pcell67_88', predict \delta methylation")
est.summary()
Out[46]:
In [47]:
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs3c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, NormalBCD19pCD27pcell67_88")
Out[47]:
In [48]:
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference", data=pairs3c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, NormalBCD19pCD27pcell67_88")
Out[48]:
In [49]:
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference", data=pairs3c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, NormalBCD19pCD27pcell67_88")
Out[49]:
In [50]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell1_22_']
mcell = mcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation, mcell.methylation)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4 = pd.merge(out, methylation_differences, how='inner')
pairs4 = pairs4.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs4.methylation_difference # dependent variable
X = pairs4.drop(['methylation', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'NormalBCD19pCD27mcell1_22', predict \delta methylation")
est.summary()
Out[50]:
In [51]:
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs4, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, NormalBCD19pCD27mcell1_22 ")
Out[51]:
In [52]:
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference", data=pairs4, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, NormalBCD19pCD27mcell1_22")
Out[52]:
In [53]:
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference", data=pairs4, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, NormalBCD19pCD27mcell1_22")
Out[53]:
In [54]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell23_44']
mcell = mcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation, mcell.methylation)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4a = pd.merge(out, methylation_differences, how='inner')
pairs4a = pairs4a.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs4a.methylation_difference # dependent variable
X = pairs4a.drop(['methylation', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'NormalBCD19pCD27mcell23_44', predict \delta methylation")
est.summary()
Out[54]:
In [55]:
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs4a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, NormalBCD19pCD27mcell23_44 ")
Out[55]:
In [56]:
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference", data=pairs4a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, NormalBCD19pCD27mcell23_44")
Out[56]:
In [57]:
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference", data=pairs4a, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, NormalBCD19pCD27mcell23_44")
Out[57]:
In [58]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell45_66']
mcell = mcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation, mcell.methylation)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4b = pd.merge(out, methylation_differences, how='inner')
pairs4b = pairs4b.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs4b.methylation_difference # dependent variable
X = pairs4b.drop(['methylation', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'NormalBCD19pCD27mcell23_44', predict \delta methylation")
est.summary()
Out[58]:
In [59]:
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs4b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, NormalBCD19pCD27mcell45_66 ")
Out[59]:
In [60]:
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference", data=pairs4b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, NormalBCD19pCD27mcell45_66")
Out[60]:
In [61]:
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference", data=pairs4b, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, NormalBCD19pCD27mcell45_66")
Out[61]:
In [62]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell67_88']
mcell = mcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.methylation, mcell.methylation)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4c = pd.merge(out, methylation_differences, how='inner')
pairs4c = pairs4c.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs4c.methylation_difference # dependent variable
X = pairs4c.drop(['methylation', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'NormalBCD19pCD27mcell23_44', predict \delta methylation")
est.summary()
Out[62]:
In [63]:
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs4c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, NormalBCD19pCD27mcell67_88 ")
Out[63]:
In [64]:
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference", data=pairs4c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, NormalBCD19pCD27mcell67_88")
Out[64]:
In [65]:
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference", data=pairs4c, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, NormalBCD19pCD27mcell67_88")
Out[65]:
In [66]:
normb = merged[merged["protocol"] == 'normal_B_cell_A1_24']
normb = normb.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5 = pd.merge(out, methylation_differences, how='inner')
pairs5 = pairs5.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs5.methylation_difference # dependent variable
X = pairs5.drop(['methylation', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'normal_B_cell_A1_24', predict \delta methylation")
est.summary()
Out[66]:
In [67]:
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs5, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, normal_B_cell_A1_24")
Out[67]:
In [68]:
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference", data=pairs5, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, normal_B_cell_A1_24")
Out[68]:
In [69]:
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference", data=pairs5, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, normal_B_cell_A1_24")
Out[69]:
In [70]:
normb = merged[merged["protocol"] == 'normal_B_cell_B1_24']
normb = normb.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5a = pd.merge(out, methylation_differences, how='inner')
pairs5a = pairs5a.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs5a.methylation_difference # dependent variable
X = pairs5a.drop(['methylation', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'normal_B_cell_B1_24', predict \delta methylation")
est.summary()
Out[70]:
In [71]:
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs5a, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, normal_B_cell_B1_24")
Out[71]:
In [72]:
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference", data=pairs5a, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, normal_B_cell_B1_24")
Out[72]:
In [73]:
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference", data=pairs5a, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, normal_B_cell_B1_24")
Out[73]:
In [74]:
normb = merged[merged["protocol"] == 'normal_B_cell_C1_24']
normb = normb.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5b = pd.merge(out, methylation_differences, how='inner')
pairs5b = pairs5b.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs5b.methylation_difference # dependent variable
X = pairs5b.drop(['methylation', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'normal_B_cell_C1_24', predict \delta methylation")
est.summary()
Out[74]:
In [75]:
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs5b, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, normal_B_cell_C1_24")
Out[75]:
In [76]:
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference", data=pairs5b, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, normal_B_cell_C1_24")
Out[76]:
In [77]:
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference", data=pairs5b, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, normal_B_cell_C1_24")
Out[77]:
In [78]:
normb = merged[merged["protocol"] == 'normal_B_cell_D1_24']
normb = normb.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5c = pd.merge(out, methylation_differences, how='inner')
pairs5c = pairs5c.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs5c.methylation_difference # dependent variable
X = pairs5c.drop(['methylation', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'normal_B_cell_D1_24', predict \delta methylation")
est.summary()
Out[78]:
In [79]:
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs5c, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, normal_B_cell_D1_24 ")
Out[79]:
In [80]:
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference", data=pairs5c, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, normal_B_cell_D1_24")
Out[80]:
In [81]:
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference", data=pairs5c, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, normal_B_cell_D1_24")
Out[81]:
In [82]:
normb = merged[merged["protocol"] == 'normal_B_cell_G1_22']
normb = normb.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5d = pd.merge(out, methylation_differences, how='inner')
pairs5d = pairs5d.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs5d.methylation_difference # dependent variable
X = pairs5d.drop(['methylation', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'normal_B_cell_G1_22', predict \delta methylation")
est.summary()
Out[82]:
In [83]:
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs5d, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, normal_B_cell_G1_22 ")
Out[83]:
In [84]:
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference", data=pairs5d, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, normal_B_cell_G1_22")
Out[84]:
In [85]:
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference", data=pairs5d, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, normal_B_cell_G1_22")
Out[85]:
In [86]:
normb = merged[merged["protocol"] == 'normal_B_cell_H1_22']
normb = normb.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.methylation, normb.methylation)), normb.filename, normb.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5e = pd.merge(out, methylation_differences, how='inner')
pairs5e = pairs5e.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
y = pairs5e.methylation_difference # dependent variable
X = pairs5e.drop(['methylation', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for 'normal_B_cell_H1_22', predict \delta methylation")
est.summary()
Out[86]:
In [87]:
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=pairs5e, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, normal_B_cell_H1_22")
Out[87]:
In [88]:
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference", data=pairs5e, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, normal_B_cell_H1_22")
Out[88]:
In [89]:
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference", data=pairs5e, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, normal_B_cell_H1_22")
Out[89]:
In [90]:
pairs['type'] = str('CLL')
pairs1a['type'] = str('CLL')
pairs2['type'] = str('CLL')
pairs2a['type'] = str('CLL')
pairs2b['type'] = str('CLL')
pairs3['type'] = str('CLL')
pairs3a['type'] = str('normal')
pairs3b['type'] = str('normal')
pairs3c['type'] = str('normal')
pairs4['type'] = str('normal')
pairs4a['type'] = str('normal')
pairs4b['type'] = str('normal')
pairs4c['type'] = str('normal')
pairs5['type'] = str('normal')
pairs5a['type'] = str('normal')
pairs5b['type'] = str('normal')
pairs5c['type'] = str('normal')
pairs5d['type'] = str('normal')
pairs5e['type'] = str('normal')
frames22 = [pairs, pairs2, pairs3, pairs4]
total_pairs = pd.concat(frames22)
y = total_pairs.methylation_difference # dependent variable
X = total_pairs.drop(['methylation', 'methylation_difference', 'total_reads_mean', 'filename'], axis=1)
categorical_variables = ['type']
for variable in categorical_variables:
# Fill missing data with the word "Missing"
X[variable].fillna("Missing", inplace=True)
# Create array of dummies
dummies = pd.get_dummies(X[variable], prefix=variable)
# Update X to include dummies and drop the main variable
X = pd.concat([X, dummies], axis=1)
X.drop([variable], axis=1, inplace=True)
X = X.drop(['type_normal'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta methylation")
est.summary()
Out[90]:
In [91]:
sns.jointplot(x="bsRate_mean", y="methylation_difference", data=total_pairs, kind="reg")
plt.title("methylation_difference vs. bsRate, jointplot, both CLL and normal ")
Out[91]:
In [92]:
sns.jointplot(x="avgReadCpGs_mean", y="methylation_difference", data=total_pairs, kind="reg")
plt.title("methylation_difference vs. avgReadCpGs_mean, jointplot, both CLL and normal ")
Out[92]:
In [93]:
sns.jointplot(x="Unique_CpGs_mean", y="methylation_difference", data=total_pairs, kind="reg")
plt.title("methylation_difference vs. Unique_CpGs_mean, jointplot, both CLL and normal ")
Out[93]:
In [94]:
sns.lmplot(x="bsRate_mean", y="methylation_difference", data=total_pairs, hue='type')
Out[94]:
In [95]:
sns.lmplot(x="avgReadCpGs_mean", y="methylation_difference", data=total_pairs, hue='type')
Out[95]:
In [96]:
sns.lmplot(x="Unique_CpGs_mean", y="methylation_difference", data=total_pairs, hue='type')
Out[96]:
In [97]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=36)
model.fit(X, y)
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting methylation_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))
In [ ]:
In [ ]: