In [1]:
%matplotlib inline
In [2]:
#
# Covariates are:
# - Number of unique CpGs per cell
# - Median Average Read CpG per cell (or mean if normally distributed)
# - BS rate per cell
# - CLL or Normal status per cell
#
# For distances (i.e. PDR difference between pairs, methylation difference between pairs),
# the covariates are the mean between the two pairs.
#
In [3]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # print all rows
import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')
import statsmodels.api as sm
In [4]:
stats = pd.read_csv("all_RRBS_statistics_final.csv")
In [5]:
stats.shape
Out[5]:
In [6]:
normal = stats[stats["type"]=="normal"]
CLL = stats[stats["type"]=="CLL"]
In [7]:
len(normal)
Out[7]:
In [8]:
len(CLL)
Out[8]:
In [9]:
mcell_cpg = pd.read_csv("mcell_avgCpGs.csv")
pcell_cpg = pd.read_csv("pcell_avgCpGs.csv")
CD19cell_cpg = pd.read_csv("CD19_avgCpGs.csv")
normalB_cell_cpg = pd.read_csv("normalB_avgCpGs.csv")
trito_cell_cpg = pd.read_csv("trito_avgCpGs.csv")
cw154_cell_cpg = pd.read_csv("cw154_cpgs.csv")
In [10]:
mcell_cpg = mcell_cpg.drop(["Unnamed: 0"], axis=1)
pcell_cpg = pcell_cpg.drop(["Unnamed: 0"], axis=1)
CD19cell_cpg = CD19cell_cpg.drop(["Unnamed: 0"], axis=1)
normalB_cell_cpg = normalB_cell_cpg.drop(["Unnamed: 0"], axis=1)
trito_cell_cpg = trito_cell_cpg.drop(["Unnamed: 0"], axis=1)
cw154_cell_cpg = cw154_cell_cpg.drop(["Unnamed: 0"], axis=1)
In [11]:
cpg_total = pd.concat([mcell_cpg, pcell_cpg, CD19cell_cpg, normalB_cell_cpg, trito_cell_cpg, cw154_cell_cpg])
In [12]:
cpg_total.shape
Out[12]:
In [13]:
merged = stats.merge(cpg_total, on="filename")
In [14]:
merged.shape
Out[14]:
In [15]:
merged.head()
Out[15]:
In [ ]:
In [ ]:
In [16]:
tritopool = merged[merged["protocol"] == 'trito_pool_1'] # select only "trito_pool_1" files
print(len(tritopool))
tritopool = tritopool.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
tritopoolA = tritopool.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool.filename,2)) # combines into all pairs
out = pd.DataFrame([tritopoolA.loc[c,:].mean() for c in cc], index=cc) # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.PDR_unweighted, tritopool.PDR_unweighted)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1 = pd.merge(out, PDR_differences, how='inner')
print(pairs1.shape)
pairs1 = pairs1.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs1.PDR_difference # dependent variable to predict
X = pairs1.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[16]:
In [17]:
print("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs1, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")
Out[17]:
In [18]:
print("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs1, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")
Out[18]:
In [19]:
print("PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_1")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs1, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")
Out[19]:
In [20]:
tritopool = merged[merged["protocol"] == 'trito_pool_2'] # select only "trito_pool_1" files
print(len(tritopool))
tritopool = tritopool.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
tritopoolA = tritopool.set_index("filename")
from itertools import combinations
cc = list(combinations(tritopool.filename,2)) # combines into all pairs
out = pd.DataFrame([tritopoolA.loc[c,:].mean() for c in cc], index=cc) # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.PDR_unweighted, tritopool.PDR_unweighted)), tritopool.filename, tritopool.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1a = pd.merge(out, PDR_differences, how='inner')
print(pairs1a.shape)
pairs1a = pairs1a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs1a.PDR_difference # dependent variable to predict
X = pairs1a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'RRBS_trito_pool1', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[20]:
In [21]:
print("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs1a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_2")
Out[21]:
In [22]:
print("PDR_difference vs. mean avgReadCpGs, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs1a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")
Out[22]:
In [23]:
print("PDR_difference vs. total unique CpGs, jointplot, RRBS_trito_pool CLL trito_2")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs1a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_2")
Out[23]:
In [ ]:
In [24]:
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.PDR_unweighted, cw154.PDR_unweighted)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2 = pd.merge(out, PDR_differences, how='inner')
print(pairs2.shape)
pairs2 = pairs2.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs2.PDR_difference # dependent variable to predict
X = pairs2.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_Tris_protease', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[24]:
In [25]:
print("PDR_difference vs. bsRate, jointplot, CLL cw154_Tris_protease")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs2, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, cw154 CLL cw154_Tris_protease")
Out[25]:
In [26]:
print("PDR_difference vs. avgReadCpGs, jointplot, CLL cw154_Tris_protease")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs2, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_Tris_protease")
Out[26]:
In [27]:
print("PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs2, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_Tris_protease")
Out[27]:
In [28]:
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease_GR']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.PDR_unweighted, cw154.PDR_unweighted)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2a = pd.merge(out, PDR_differences, how='inner')
print(pairs2a.shape)
pairs2a = pairs2a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs2a.PDR_difference # dependent variable to predict
X = pairs2a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_Tris_protease_GR', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[28]:
In [29]:
print("PDR_difference vs. bsRate, jointplot, CLL cw154_Tris_protease_GR")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs2a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, cw154 CLL cw154_Tris_protease_GR")
Out[29]:
In [30]:
print("PDR_difference vs. avgReadCpgs, jointplot, CLL cw154_Tris_protease_GR")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs2a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_Tris_protease_GR")
Out[30]:
In [31]:
print("PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_Tris_protease_GR")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs2a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_Tris_protease_GR")
Out[31]:
In [ ]:
In [ ]:
In [32]:
cw154 = merged[merged["protocol"] == 'cw154_CutSmart_proteinase_K']
print(len(cw154))
cw154 = cw154.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
cw154 = cw154.reset_index(drop=True)
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.PDR_unweighted, cw154.PDR_unweighted)), cw154.filename, cw154.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2b = pd.merge(out, PDR_differences, how='inner')
print(pairs2b.shape)
pairs2b = pairs2b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs2b.PDR_difference # dependent variable to predict
X = pairs2b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for CLL 'cw154_CutSmart_proteinase_K', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[32]:
In [33]:
print("PDR_difference vs. bsRate, jointplot, CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs2b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")
Out[33]:
In [34]:
print("PDR_difference vs. avgReadCpG_mean, jointplot, CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs2b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")
Out[34]:
In [35]:
print("PDR_difference vs. total unique CpGs, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs2b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154 CLL cw154_CutSmart_proteinase_K")
Out[35]:
In [ ]:
In [36]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell1_22_']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR_unweighted, pcell.PDR_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3 = pd.merge(out, PDR_differences, how='inner')
print(pairs3.shape)
pairs3 = pairs3.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs3.PDR_difference # dependent variable to predict
X = pairs3.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[36]:
In [37]:
print("PDR_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs3, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27pcell1_22_")
Out[37]:
In [38]:
print("PDR_difference vs. avgReadCpG_mean, jointplot, Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs3, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell1_22_")
Out[38]:
In [39]:
print("PDR_difference vs. total unique CpGs, jointplot, Normal 'NormalBCD19pCD27pcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs3, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell1_22_")
Out[39]:
In [ ]:
In [40]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell23_44']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR_unweighted, pcell.PDR_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3a = pd.merge(out, PDR_differences, how='inner')
print(pairs3a.shape)
pairs3a = pairs3a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs3a.PDR_difference # dependent variable to predict
X = pairs3a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[40]:
In [41]:
print("PDR_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs3a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27pcell23_44")
Out[41]:
In [42]:
print("PDR_difference vs. avgReadCpG_mean, jointplot, Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs3a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell23_44")
Out[42]:
In [43]:
print("PDR_difference vs. total unique CpGs, jointplot, Normal 'NormalBCD19pCD27pcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs3a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell23_44")
Out[43]:
In [ ]:
In [44]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell45_66']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR_unweighted, pcell.PDR_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3b = pd.merge(out, PDR_differences, how='inner')
print(pairs3b.shape)
pairs3b = pairs3b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs3b.PDR_difference # dependent variable to predict
X = pairs3b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[44]:
In [45]:
print("PDR_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs3b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27pcell45_66")
Out[45]:
In [46]:
print("PDR_difference vs. avgReadCpG_mean, jointplot, Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs3b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell45_66")
Out[46]:
In [47]:
print("PDR_difference vs. total unique CpGs, jointplot, Normal 'NormalBCD19pCD27pcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs3b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell45_66")
Out[47]:
In [ ]:
In [ ]:
In [48]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell67_88']
print(len(pcell))
pcell = pcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
pcell = pcell.reset_index(drop=True)
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR_unweighted, pcell.PDR_unweighted)), pcell.filename, pcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3c = pd.merge(out, PDR_differences, how='inner')
print(pairs3c.shape)
pairs3c = pairs3c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs3c.PDR_difference # dependent variable to predict
X = pairs3c.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27pcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[48]:
In [49]:
print("PDR_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs3c, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27pcell67_88")
Out[49]:
In [50]:
print("PDR_difference vs. avgReadCpGs, jointplot, Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs3c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell67_88")
Out[50]:
In [51]:
print("PDR_difference vs. total unique CpGs per cell, jointplot, Normal 'NormalBCD19pCD27pcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs3c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27pcell67_88")
Out[51]:
In [ ]:
In [ ]:
In [52]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell1_22_']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR_unweighted, mcell.PDR_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4 = pd.merge(out, PDR_differences, how='inner')
print(pairs4.shape)
pairs4 = pairs4.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs4.PDR_difference # dependent variable to predict
X = pairs4.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[52]:
In [53]:
print("PDR_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs4, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27mcell1_22_")
Out[53]:
In [54]:
print("PDR_difference vs. avgReadCpG_mean, jointplot, Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs4, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell1_22_")
Out[54]:
In [55]:
print("PDR_difference vs. total unique CpGs, jointplot, Normal 'NormalBCD19pCD27mcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs4, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell1_22_")
Out[55]:
In [ ]:
In [56]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell23_44']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR_unweighted, mcell.PDR_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4a = pd.merge(out, PDR_differences, how='inner')
print(pairs4a.shape)
pairs4a = pairs4a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs4a.PDR_difference # dependent variable to predict
X = pairs4a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell23_44', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[56]:
In [57]:
print("PDR_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs4a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27mcell23_44")
Out[57]:
In [58]:
print("PDR_difference vs. avgReadCpG_mean, jointplot, Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs4a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell23_44")
Out[58]:
In [59]:
print("PDR_difference vs. total unique CpGs per cell, jointplot, Normal 'NormalBCD19pCD27mcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs4a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell23_44")
Out[59]:
In [ ]:
In [60]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell45_66']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR_unweighted, mcell.PDR_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4b = pd.merge(out, PDR_differences, how='inner')
print(pairs4b.shape)
pairs4b = pairs4b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs4b.PDR_difference # dependent variable to predict
X = pairs4b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell45_66', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[60]:
In [61]:
print("PDR_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs4b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27mcell45_66")
Out[61]:
In [62]:
print("PDR_difference vs. avgReadCpGs, jointplot, Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs4b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell45_66")
Out[62]:
In [63]:
print("PDR_difference vs. total unique CpGs per cell, jointplot, Normal 'NormalBCD19pCD27mcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs4b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell45_66")
Out[63]:
In [ ]:
In [64]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell67_88']
print(len(mcell))
mcell = mcell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
mcell = mcell.reset_index(drop=True)
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR_unweighted, mcell.PDR_unweighted)), mcell.filename, mcell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4c = pd.merge(out, PDR_differences, how='inner')
print(pairs4c.shape)
pairs4c = pairs4c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs4c.PDR_difference # dependent variable to predict
X = pairs4c.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'NormalBCD19pCD27mcell67_88', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[64]:
In [65]:
print("PDR_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs4c, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27mcell67_88")
Out[65]:
In [66]:
print("PDR_difference vs. avgReadCpGs, jointplot, Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs4c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")
Out[66]:
In [67]:
print("PDR_difference vs. total unique CpGs per cell, jointplot, Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs4c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")
Out[67]:
In [ ]:
In [68]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell1_22_']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.PDR_unweighted, CD19cell.PDR_unweighted)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5 = pd.merge(out, PDR_differences, how='inner')
print(pairs5.shape)
pairs5 = pairs5.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs5.PDR_difference # dependent variable to predict
X = pairs5.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell1_22_', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[68]:
In [69]:
print("PDR_difference vs. bsRate, jointplot, Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs5, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal 'RRBS_NormalBCD19pcell1_22_")
Out[69]:
In [70]:
print("PDR_difference vs. avgReadCpGs, jointplot, Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs5, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell1_22_")
Out[70]:
In [71]:
print("PDR_difference vs. total unique CpGs per cell, jointplot, Normal 'RRBS_NormalBCD19pcell1_22_")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs5, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell1_22_")
Out[71]:
In [ ]:
In [72]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell23_44']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.PDR_unweighted, CD19cell.PDR_unweighted)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5a = pd.merge(out, PDR_differences, how='inner')
print(pairs5a.shape)
pairs5a = pairs5a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs5a.PDR_difference # dependent variable to predict
X = pairs5a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell23_44', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[72]:
In [73]:
print("PDR_difference vs. bsRate, jointplot, Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs5a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal 'RRBS_NormalBCD19pcell23_44")
Out[73]:
In [74]:
print("PDR_difference vs. avgReadCpGs, jointplot, Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs5a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell23_44")
Out[74]:
In [75]:
print("PDR_difference vs. total unique CpGs per cell, jointplot, Normal 'RRBS_NormalBCD19pcell23_44")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs5a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell23_44")
Out[75]:
In [ ]:
In [76]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell45_66']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.PDR_unweighted, CD19cell.PDR_unweighted)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5b = pd.merge(out, PDR_differences, how='inner')
print(pairs5b.shape)
pairs5b = pairs5b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs5b.PDR_difference # dependent variable to predict
X = pairs5b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell45_66', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[76]:
In [77]:
print("PDR_difference vs. bsRate, jointplot, Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs5b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal 'RRBS_NormalBCD19pcell45_66")
Out[77]:
In [78]:
print("PDR_difference vs. avgReadCpGs, jointplot, Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs5b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell45_66")
Out[78]:
In [79]:
print("PDR_difference vs. total unique CpGs per cell, jointplot, Normal 'RRBS_NormalBCD19pcell45_66")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs5b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'RRBS_NormalBCD19pcell45_66")
Out[79]:
In [ ]:
In [80]:
CD19cell = merged[merged["protocol"] == 'RRBS_NormalBCD19pcell67_88']
print(len(CD19cell))
CD19cell = CD19cell.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
CD19cell = CD19cell.reset_index(drop=True)
CD19cellA = CD19cell.set_index("filename")
from itertools import combinations
cc = list(combinations(CD19cell.filename,2))
out = pd.DataFrame([CD19cellA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(CD19cell.PDR_unweighted, CD19cell.PDR_unweighted)), CD19cell.filename, CD19cell.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5c = pd.merge(out, PDR_differences, how='inner')
print(pairs5c.shape)
pairs5c = pairs5c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs5c.PDR_difference # dependent variable to predict
X = pairs5c.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'RRBS_NormalBCD19pcell67_88', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[80]:
In [81]:
print("PDR_difference vs. bsRate, jointplot, Normal 'RRBS_NormalBCD19pcell67_88")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs5c, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal 'NormalBCD19pCD27mcell67_88")
Out[81]:
In [82]:
print("PDR_difference vs. avgReadCpGs, jointplot, Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs5c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")
Out[82]:
In [83]:
print("PDR_difference vs. total unique CpGs per cell, jointplot, Normal 'NormalBCD19pCD27mcell67_88")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs5c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'NormalBCD19pCD27mcell67_88")
Out[83]:
In [ ]:
In [84]:
normb = merged[merged["protocol"] == 'normal_B_cell_A1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6 = pd.merge(out, PDR_differences, how='inner')
print(pairs6.shape)
pairs6 = pairs6.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs6.PDR_difference # dependent variable to predict
X = pairs6.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_A1_24', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[84]:
In [85]:
print("PDR_difference vs. bsRate, jointplot, Normal 'normal_B_cell_A1_24")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs6, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal 'normal_B_cell_A1_24")
Out[85]:
In [86]:
print("PDR_difference vs. avgReadCpGs, jointplot, Normal 'normal_B_cell_A1_24")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs6, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_A1_24")
Out[86]:
In [87]:
print("PDR_difference vs. total unique CpGs per cell, jointplot, Normal 'normal_B_cell_A1_24")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs6, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_A1_24")
Out[87]:
In [ ]:
In [88]:
normb = merged[merged["protocol"] == 'normal_B_cell_B1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6a = pd.merge(out, PDR_differences, how='inner')
print(pairs6a.shape)
pairs6a = pairs6a.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs6a.PDR_difference # dependent variable to predict
X = pairs6a.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_B1_24', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[88]:
In [89]:
print("PDR_difference vs. bsRate, jointplot, Normal 'normal_B_cell_B1_24")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs6a, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal 'normal_B_cell_B1_24")
Out[89]:
In [90]:
print("PDR_difference vs. avgReadCpGs, jointplot, Normal 'normal_B_cell_B1_24")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs6a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_B1_24")
Out[90]:
In [91]:
print("PDR_difference vs. total unique CpGs per cell, jointplot, Normal 'normal_B_cell_B1_24")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs6a, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_B1_24")
Out[91]:
In [ ]:
In [92]:
normb = merged[merged["protocol"] == 'normal_B_cell_C1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6b = pd.merge(out, PDR_differences, how='inner')
print(pairs6b.shape)
pairs6b = pairs6b.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs6b.PDR_difference # dependent variable to predict
X = pairs6b.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_C1_24', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[92]:
In [93]:
print("PDR_difference vs. bsRate, jointplot, Normal 'normal_B_cell_C1_24")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs6b, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal 'normal_B_cell_C1_24")
Out[93]:
In [94]:
print("PDR_difference vs. avgReadCpGs, jointplot, Normal 'normal_B_cell_C1_24")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs6b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_C1_24")
Out[94]:
In [95]:
print("PDR_difference vs. total unique CpGs per cell, jointplot, Normal 'normal_B_cell_C1_24")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs6b, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_C1_24")
Out[95]:
In [ ]:
In [96]:
normb = merged[merged["protocol"] == 'normal_B_cell_D1_24']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6c = pd.merge(out, PDR_differences, how='inner')
print(pairs6c.shape)
pairs6c = pairs6c.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs6c.PDR_difference # dependent variable to predict
X = pairs6c.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_D1_24', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[96]:
In [97]:
print("PDR_difference vs. bsRate, jointplot, Normal 'normal_B_cell_D1_24")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs6c, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal 'normal_B_cell_D1_24")
Out[97]:
In [98]:
print("PDR_difference vs. avgReadCpGs, jointplot, Normal 'normal_B_cell_D1_24")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs6c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_D1_24")
Out[98]:
In [99]:
print("PDR_difference vs. total unique CpGs per cell, jointplot, Normal 'normal_B_cell_D1_24")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs6c, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_D1_24")
Out[99]:
In [ ]:
In [100]:
normb = merged[merged["protocol"] == 'normal_B_cell_G1_22']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6d = pd.merge(out, PDR_differences, how='inner')
print(pairs6d.shape)
pairs6d = pairs6d.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs6d.PDR_difference # dependent variable to predict
X = pairs6d.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_G1_22', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[100]:
In [101]:
print("PDR_difference vs. bsRate, jointplot, Normal 'normal_B_cell_G1_22")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs6d, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal 'normal_B_cell_G1_22")
Out[101]:
In [102]:
print("PDR_difference vs. avgReadCpGs, jointplot, Normal 'normal_B_cell_G1_22")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs6d, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_G1_22")
Out[102]:
In [103]:
print("PDR_difference vs. total unique CpGs per cell, jointplot, Normal 'normal_B_cell_G1_22")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs6d, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_G1_22")
Out[103]:
In [ ]:
In [104]:
normb = merged[merged["protocol"] == 'normal_B_cell_H1_22']
print(len(normb))
normb = normb.drop(["thisMeth", "mixedReadCount", "bio", "protocol"], axis=1) # will not need these columns; efficiency
normb = normb.reset_index(drop=True)
normbA = normb.set_index("filename")
from itertools import combinations
cc = list(combinations(normb.filename,2))
out = pd.DataFrame([normbA.loc[c,:].mean() for c in cc], index=cc)
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normb.PDR_unweighted, normb.PDR_unweighted)), normb.filename, normb.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6e = pd.merge(out, PDR_differences, how='inner')
print(pairs6e.shape)
pairs6e = pairs6e.rename(columns = {'total_reads':'total_reads_mean', "bsRate":"bsRate_mean", "avgReadCpgs_gtreql3.8CpG":"avgReadCpG_mean"})
y = pairs6e.PDR_difference # dependent variable to predict
X = pairs6e.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
print(y.shape)
print(X.shape)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results for Normal 'normal_B_cell_H1_22', predict \delta PDR_unweighted")
print("")
print("Variates used are " + str(X.columns))
print("")
est.summary()
Out[104]:
In [105]:
print("PDR_difference vs. bsRate, jointplot, Normal 'normal_B_cell_H1_22")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs6e, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal 'normal_B_cell_H1_22")
Out[105]:
In [106]:
print("PDR_difference vs. avgReadCpGs, jointplot, Normal 'normal_B_cell_H1_22")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=pairs6e, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_H1_22")
Out[106]:
In [107]:
print("PDR_difference vs. total unique CpGs per cell, jointplot, Normal 'normal_B_cell_H1_22")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=pairs6e, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, Normal 'normal_B_cell_H1_22")
Out[107]:
In [108]:
print(pairs1.shape)
print(pairs1a.shape)
print(pairs2.shape)
print(pairs2a.shape)
print(pairs2b.shape)
print(pairs3.shape)
print(pairs3a.shape)
print(pairs3b.shape)
print(pairs3c.shape)
print(pairs4.shape)
print(pairs4a.shape)
print(pairs4b.shape)
print(pairs4c.shape)
print(pairs5.shape)
print(pairs5a.shape)
print(pairs5b.shape)
print(pairs5c.shape)
print(pairs6.shape)
print(pairs6a.shape)
print(pairs6b.shape)
print(pairs6c.shape)
print(pairs6d.shape)
print(pairs6e.shape)
In [109]:
pairs1['type'] = str('CLL')
pairs1a['type'] = str('CLL')
pairs2['type'] = str('CLL')
pairs2a['type'] = str('CLL')
pairs2b['type'] = str('CLL')
pairs3['type'] = str('normal')
pairs3a['type'] = str('normal')
pairs3b['type'] = str('normal')
pairs3c['type'] = str('normal')
pairs4['type'] = str('normal')
pairs4a['type'] = str('normal')
pairs4b['type'] = str('normal')
pairs4c['type'] = str('normal')
pairs5['type'] = str('normal')
pairs5a['type'] = str('normal')
pairs5b['type'] = str('normal')
pairs5c['type'] = str('normal')
pairs6['type'] = str('normal')
pairs6a['type'] = str('normal')
pairs6b['type'] = str('normal')
pairs6c['type'] = str('normal')
pairs6d['type'] = str('normal')
pairs6e['type'] = str('normal')
frames = [pairs1, pairs1a, pairs2, pairs2a, pairs2b, pairs3, pairs3a, pairs3b, pairs3c,
pairs4, pairs4a, pairs4b, pairs4c, pairs5, pairs5a, pairs5b, pairs5c, pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e]
In [110]:
total_pairs = pd.concat(frames)
In [111]:
total_pairs.shape
Out[111]:
In [112]:
y = total_pairs.PDR_difference # dependent variable
X = total_pairs.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG"], axis=1)
In [113]:
categorical_variables = ['type']
for variable in categorical_variables:
# Fill missing data with the word "Missing"
X[variable].fillna("Missing", inplace=True)
# Create array of dummies
dummies = pd.get_dummies(X[variable], prefix=variable)
# Update X to include dummies and drop the main variable
X = pd.concat([X, dummies], axis=1)
X.drop([variable], axis=1, inplace=True)
X = X.drop(['type_normal'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR_unweighted")
est.summary()
Out[113]:
In [114]:
X.head()
Out[114]:
In [115]:
print("PDR_difference vs. bsRate, jointplot, both CLL and normal ")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=total_pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, both CLL and normal ")
Out[115]:
In [116]:
print("PDR_difference vs. mean avgReadCpG per cell, jointplot, both CLL and normal ")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=total_pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpG_mean, jointplot, both CLL and normal ")
Out[116]:
In [117]:
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=total_pairs, kind="reg")
plt.title("PDR_difference vs. total # Unique_CpGs per cell, jointplot, both CLL and normal ")
Out[117]:
In [118]:
print("PDR_difference vs. bsRate, by type! ")
sns.lmplot(x="bsRate_mean", y="PDR_difference", data=total_pairs, hue='type')
plt.title("PDR_difference vs. bsRate, by type, CLL vs normal")
Out[118]:
In [119]:
print("PDR_difference vs. avgReadCpG per cell, by type! ")
sns.lmplot(x="avgReadCpG_mean", y="PDR_difference", data=total_pairs, hue='type')
plt.title("PDR_difference vs. avgReadCpG per cell, by type, CLL vs Normal ")
Out[119]:
In [120]:
print("PDR_difference vs. total unique CpG per cel, by type!")
sns.lmplot(x="total_cpg_no_filter", y="PDR_difference", data=total_pairs, hue='type')
plt.title("PDR_difference vs. total unique CpG per cell, by type, CLL vs Normal ")
Out[120]:
In [121]:
#
# Let's see feature ranking
#
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability,
model.fit(X, y) # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: cell type, avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))
In [122]:
#
# Now do analysis by cell type, CLL vs normal, entire
#
cll_frames = [pairs1, pairs1a, pairs2, pairs2a, pairs2b]
normal_frames = [pairs3, pairs3a, pairs3b, pairs3c, pairs4, pairs4a, pairs4b, pairs4c,
pairs5, pairs5a, pairs5b, pairs5c, pairs6, pairs6a, pairs6b, pairs6c, pairs6d, pairs6e]
cll_pairs = pd.concat(cll_frames)
print(cll_pairs.shape)
normal_pairs = pd.concat(normal_frames)
print(normal_pairs.shape)
In [123]:
#
# CLL first
#
y = cll_pairs.PDR_difference # dependent variable
X = cll_pairs.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG", "type"], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR")
print(X.shape)
est.summary()
Out[123]:
In [124]:
print("PDR_difference vs. bsRate, jointplot, CLL pairs ")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=total_pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, CLL pairs ")
Out[124]:
In [125]:
print("PDR_difference vs. mean avgReadCpG per cell, jointplot, CLL pairs ")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=total_pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpG_mean, jointplot, CLL pairs ")
Out[125]:
In [126]:
print("PDR difference vs. total # unique CpGs per cell, CLL pairs")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=total_pairs, kind="reg")
plt.title("PDR_difference vs. total # Unique_CpGs per cell, CLL pairs")
Out[126]:
In [127]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability,
model.fit(X, y) # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))
In [ ]:
In [128]:
#
# normal
#
y = normal_pairs.PDR_difference # dependent variable
X = normal_pairs.drop(["PDR_difference", "filename", "methylation", "PDR_total", "methylation_unweighted", "PDR_unweighted", "total_reads_mean",
"total_cpg_gtrthan1", "total_cpg_gtrthan38", "avgReadCpgs_nofilter", "avgReadCpgs_lessthan1CpG", "type"], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
print("Regression results, all batches 'Normal B' vs 'CLL' , predict \delta PDR")
print(X.shape)
est.summary()
Out[128]:
In [129]:
print("PDR_difference vs. bsRate, jointplot, Normal pairs ")
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=total_pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, Normal pairs ")
Out[129]:
In [130]:
print("PDR_difference vs. mean avgReadCpG per cell, jointplot, Normal pairs ")
sns.jointplot(x="avgReadCpG_mean", y="PDR_difference", data=total_pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpG_mean, jointplot, Normal pairs ")
Out[130]:
In [131]:
print("PDR diference vs. total # unique CpGs per cell, Normal pairs")
sns.jointplot(x="total_cpg_no_filter", y="PDR_difference", data=total_pairs, kind="reg")
plt.title("PDR_difference vs. total # Unique_CpGs per cell, Normal pairs")
Out[131]:
In [132]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
model = RandomForestRegressor(n_estimators=10000, oob_score=True, random_state=42) # random state == replicability,
model.fit(X, y) # 42 --- cf. Douglas Adams
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(str("Random Forest model score is ") + str(model.score(X,y)))
In [ ]:
In [ ]:
In [ ]:
In [ ]: