In [1]:
%matplotlib inline
In [2]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # show up to 50 columns
import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')
In [3]:
#
# CLL_cells_PDR_total_chr2_5_11A.csv
# CLL_cells_PDR_total_chr2_5_11B.csv
# CLL_cells_PDR_total_chr2_5_11_tritoC.csv
# Normal_cells_PDR_total_chr2_5_11A.csv
# Normal_cells_PDR_total_chr2_5_11B.csv
# Normal_cells_PDR_total_chr2_5_11C_mcells.csv
#
In [4]:
"""
RRBS_NormalBCD19pCD27pcell_CpGs.csv
RRBS_NormalBCD19pCD27mcell_CpGs.csv
Meth_PDR_cell_RRBS_normal_B1_CpGs.csv
Meth_PDR_cell_RRBS_trito_pool_CpGs.csv
CLL_RRBS_cw154_A_CpGs.csv
"""
Out[4]:
In [5]:
normal_cellA_df = pd.read_csv("Meth_PDR_cell_RRBS_normal_B1.csv")
normal_cellB_df = pd.read_csv("Meth_PDR_cell_normalpcell.csv")
normal_cellC_df = pd.read_csv("Meth_PDR_cell_normal_mcell.csv")
In [ ]:
In [6]:
normal_cellA_df = normal_cellA_df.drop(["Unnamed: 0"], axis=1)
In [7]:
normal_cellA_df["type"] = str('normal')
In [8]:
normal_cellA_df["bio"] = str('normal_B')
In [9]:
normal_cellA_df["protocol"] = normal_cellA_df["filename"].str[5:24]
In [10]:
normal_cellA_df["filename"] = normal_cellA_df["filename"].str[:40]
In [11]:
normal_cellA_df.head()
Out[11]:
In [12]:
cpg1 = pd.read_csv('Meth_PDR_cell_RRBS_normal_B1_CpGs.csv')
In [13]:
cpg1 = cpg1.drop(["Unnamed: 0"], axis=1)
In [14]:
cpg1["filename"] = cpg1["filename"].str[:40]
In [15]:
cpg1.head()
Out[15]:
In [16]:
normal_cellA_df = pd.merge(normal_cellA_df, cpg1, how='inner')
In [17]:
normal_cellB_df["type"] = str('normal')
In [18]:
normal_cellB_df = normal_cellB_df.drop(["Unnamed: 0"], axis=1)
In [19]:
normal_cellB_df.head()
Out[19]:
In [20]:
normal_cellB_df["bio"] = str('CD27p')
In [21]:
normal_cellB_df["protocol"] = normal_cellB_df["filename"].str[5:31]
In [22]:
normal_cellB_df["filename"] = normal_cellB_df["filename"].str[:50]
In [23]:
normal_cellB_df["filename"] = normal_cellB_df["filename"].str.replace(r'.dan$', '')
normal_cellB_df["filename"] = normal_cellB_df["filename"].str.replace(r'.da$', '')
In [24]:
normal_cellB_df.head()
Out[24]:
In [25]:
normal_cellB_df.tail()
Out[25]:
In [26]:
cpg2 = pd.read_csv('RRBS_NormalBCD19pCD27pcell_CpGs.csv')
In [27]:
cpg2 = cpg2.drop(["Unnamed: 0"], axis=1)
In [28]:
cpg2["filename"] = cpg2["filename"].str[:50]
In [29]:
cpg2["filename"] = cpg2["filename"].str.replace(r'.dan$', '')
cpg2["filename"] = cpg2["filename"].str.replace(r'.da$', '')
In [30]:
normal_cellB_df = pd.merge(normal_cellB_df, cpg2, how='inner')
In [31]:
normal_cellC_df = normal_cellC_df.drop(["Unnamed: 0"], axis=1)
In [32]:
normal_cellC_df["type"] = str('normal')
In [33]:
normal_cellC_df["protocol"] = normal_cellC_df["filename"].str[5:31]
In [34]:
normal_cellC_df["bio"] = str('CD27m')
In [35]:
normal_cellC_df["filename"] = normal_cellC_df["filename"].str[:50]
In [36]:
normal_cellC_df["filename"] = normal_cellC_df["filename"].str.replace(r'.dan$', '')
normal_cellC_df["filename"] = normal_cellC_df["filename"].str.replace(r'.da$', '')
In [ ]:
In [37]:
cpg3 = pd.read_csv("RRBS_NormalBCD19pCD27mcell_CpGs.csv")
In [38]:
cpg3 = cpg3.drop(["Unnamed: 0"], axis=1)
In [39]:
cpg3["filename"] = cpg3["filename"].str[:50]
In [40]:
cpg3["filename"] = cpg3["filename"].str.replace(r'.dan$', '')
cpg3["filename"] = cpg3["filename"].str.replace(r'.da$', '')
In [41]:
normal_cellC_df = pd.merge(normal_cellC_df, cpg3, how='inner')
In [42]:
frames3 = [normal_cellA_df, normal_cellB_df, normal_cellC_df]
normal_result = pd.concat(frames3)
In [43]:
normal_result.shape
Out[43]:
In [44]:
normal_result.columns
Out[44]:
In [45]:
normal_result = normal_result[['filename', 'PDR_total', 'total_reads', 'type', 'bio', 'protocol', 'avgReadCpGs_mean', 'avgReadCpGs_median']]
In [46]:
normal_result.head()
Out[46]:
In [47]:
cll_cellA_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_cw154_A.csv")
# cll_cellB_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_NormalB_CLL_B.csv")
cll_cellC_df = pd.read_csv("Meth_PDR_cell_CLL_RRBS_trito_pool_C.csv")
In [ ]:
In [48]:
cll_cellA_df = cll_cellA_df.drop(["Unnamed: 0"], axis=1)
In [49]:
cll_cellA_df["type"] = str('CLL')
In [50]:
cll_cellA_df["protocol"] = cll_cellA_df["filename"].str[5:34]
In [51]:
cll_cellA_df["protocol"][cll_cellA_df["protocol"] == 'cw154_CutSmart_proteinase_K_T'] = 'cw154_CutSmart_proteinase_K'
In [52]:
cll_cellA_df["protocol"][cll_cellA_df["protocol"] == 'cw154_Tris_protease_GR_CAGAGA'] = 'cw154_Tris_protease_GR'
In [53]:
cll_cellA_df["protocol"][(cll_cellA_df["protocol"] != 'cw154_Tris_protease_GR') & (cll_cellA_df["protocol"] != 'cw154_CutSmart_proteinase_K')] = 'cw154_Tris_protease'
In [ ]:
In [54]:
cll_cellA_df["bio"] = str('CLL')
In [55]:
cll_cellA_df["filename"] = cll_cellA_df["filename"].str[:51]
In [56]:
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.da$', '')
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.annoRR$', '')
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.ann$', '')
cll_cellA_df["filename"] = cll_cellA_df["filename"].str.replace(r'.dan$', '')
In [ ]:
In [57]:
cpg4 = pd.read_csv('CLL_RRBS_cw154_A_CpGs.csv')
In [58]:
cpg4 = cpg4.drop(["Unnamed: 0"], axis=1)
In [59]:
cpg4["filename"] = cpg4["filename"].str[:51]
In [60]:
cpg4["filename"] = cpg4["filename"].str.replace(r'.da$', '')
cpg4["filename"] = cpg4["filename"].str.replace(r'.annoRR$', '')
cpg4["filename"] = cpg4["filename"].str.replace(r'.ann$', '')
cpg4["filename"] = cpg4["filename"].str.replace(r'.dan$', '')
In [61]:
cll_cellA_df = pd.merge(cll_cellA_df, cpg4, how='inner')
In [ ]:
In [62]:
cll_cellC_df = cll_cellC_df.drop(["Unnamed: 0"], axis=1)
In [63]:
cll_cellC_df["type"] = str('CLL')
In [64]:
cll_cellC_df["bio"] = str('CLL')
In [65]:
cll_cellC_df["protocol"] = cll_cellC_df["filename"].str[5:17]
In [66]:
cll_cellC_df["filename"] = cll_cellC_df["filename"].str[:33]
In [67]:
cll_cellC_df.head()
Out[67]:
In [ ]:
In [68]:
cpg5 = pd.read_csv('Meth_PDR_cell_RRBS_trito_pool_CpGs.csv')
In [69]:
cpg5 = cpg5.drop(["Unnamed: 0"], axis=1)
In [70]:
cpg5["filename"] = cpg5["filename"].str[:33]
In [71]:
cll_cellC_df = pd.merge(cll_cellC_df, cpg5, how='inner')
In [ ]:
In [72]:
frames2 = [cll_cellA_df, cll_cellC_df]
cll_result = pd.concat(frames2)
In [73]:
cll_result.shape
Out[73]:
In [74]:
cll_result.head()
Out[74]:
In [75]:
cll_result.columns
Out[75]:
In [76]:
cll_result = cll_result[['filename', 'PDR_total', 'total_reads', 'type', 'bio', 'protocol', 'avgReadCpGs_mean', 'avgReadCpGs_median']]
In [77]:
cll_result = cll_result.reset_index(drop=True)
In [78]:
normal_result = normal_result.reset_index(drop=True)
In [79]:
combined2 = pd.concat([normal_result, cll_result])
In [80]:
combined2 = combined2.reset_index(drop=True)
In [81]:
combined2.head()
Out[81]:
In [ ]:
In [82]:
# Variables for a model predicting PDR:
#
# - totCpG: number of unique CpGs per cell
# - avgReadCpGs: median average read CpGs per cell (or mean, if normally distributed)
# - bsRate: BS conversion rate per cell
# - type: CLL or normal status per cell
#
# What are the coefficient and the p-value of each variable in a model predicting PDR?
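In [ ]:
# A minimal sketch (not part of the original pipeline) of how the model described above can be
# fit so that each variable's coefficient and p-value come out of a single summary, using the
# statsmodels formula API. The column names mirror the merged table built below; the tiny toy
# frame here is purely illustrative, not real data.
import pandas as pd
import statsmodels.formula.api as smf

toy = pd.DataFrame({  # hypothetical values, for illustration only
    "PDR": [0.21, 0.25, 0.30, 0.28, 0.26, 0.35, 0.40, 0.38],
    "avgReadCpGs_mean": [3.1, 3.4, 2.9, 3.0, 3.2, 3.3, 3.5, 3.6],
    "Unique_CpGs": [150000, 180000, 120000, 160000, 140000, 170000, 190000, 175000],
    "bsRate": [0.994, 0.991, 0.996, 0.992, 0.995, 0.990, 0.993, 0.989],
    "type": ["normal", "normal", "normal", "normal", "CLL", "CLL", "CLL", "CLL"],
})
fit = smf.ols("PDR ~ avgReadCpGs_mean + Unique_CpGs + bsRate + C(type)", data=toy).fit()
print(fit.params)    # coefficient of each term
print(fit.pvalues)   # p-value of each coefficient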
In [ ]:
In [83]:
bs = pd.read_table('allStats.txt')
bs = bs.drop('sample.1', axis=1)
bs = bs.drop('sample.2', axis=1)
In [84]:
bs = bs.reset_index(drop=True)
In [85]:
# Drop the columns not needed downstream; totCpG is deliberately kept
bs = bs.drop(['class', 'totMeth', 'totSeen', 'avSum', 'avTot', 'rMixed', 'rTot', 'rAv',
              'rAvTot', 'bed', 'methInfoFile', 'totReads', 'totAligned', 'totClipped',
              'totSeenCpG', 'totUsed', 'totMethCpG', 'totalReadPairs', 'alignedReads',
              'totalReads'], axis=1)
In [86]:
bs.head()
Out[86]:
In [87]:
bs = bs.rename(columns = {'sample':'filename'})
In [ ]:
In [88]:
merged = pd.merge(combined2, bs, how='inner')
In [89]:
merged = merged.reset_index(drop=True)
In [90]:
merged = merged.rename(columns = {'PDR_total':'PDR'})
In [91]:
merged = merged.rename(columns = {'totCpG':'Unique_CpGs'})
In [92]:
merged.head()
Out[92]:
In [93]:
# Remove all cells with fewer than 100,000 total reads
merged = merged[merged['total_reads'] > 100000]
In [94]:
scattermatrix1 = merged.drop(['filename', 'total_reads', 'type', 'protocol', 'avgReadCpGs_median'], axis=1)
In [95]:
scattermatrix1.head()
Out[95]:
In [96]:
#sns.lmplot(x="bsRate", y="PDR", row="bio", col="Unique_CpGs", data=scattermatrix1)
In [97]:
sns.lmplot(x="bsRate", y="PDR", data=scattermatrix1)
Out[97]:
In [ ]:
In [ ]:
In [ ]:
In [98]:
sns.pairplot(scattermatrix1, hue='bio')
plt.title('Scatterplot, bio')
Out[98]:
In [99]:
scattermatrix2 = merged.drop(['filename', 'type', 'protocol', 'avgReadCpGs_median'], axis=1)
In [100]:
scattermatrix2.head()
Out[100]:
In [101]:
sns.pairplot(scattermatrix2, hue='bio')
Out[101]:
In [ ]:
In [ ]:
In [228]:
merged.head()
Out[228]:
In [229]:
scattermatrix3 = merged.drop(['filename', 'total_reads', 'bio', 'protocol', 'avgReadCpGs_median'], axis=1)
In [230]:
sns.pairplot(scattermatrix3, hue='type', palette='Set2')
Out[230]:
In [231]:
scattermatrix4 = merged.drop(['filename', 'bio', 'protocol', 'avgReadCpGs_median'], axis=1)
In [232]:
sns.pairplot(scattermatrix4, hue='type', palette='Set1')
Out[232]:
In [233]:
scattermatrix5 = merged.drop(['filename','total_reads', 'bio', 'type', 'avgReadCpGs_median'], axis=1)
In [234]:
sns.pairplot(scattermatrix5, hue='protocol')
Out[234]:
In [235]:
scattermatrix6 = merged.drop(['filename', 'bio', 'type', 'avgReadCpGs_median'], axis=1)
In [236]:
sns.pairplot(scattermatrix6, hue='protocol')
Out[236]:
In [237]:
scattermatrix7 = merged.drop(['filename', 'bio', 'avgReadCpGs_median'], axis=1)
In [238]:
scattermatrix7.columns
Out[238]:
In [239]:
y = scattermatrix7.PDR # dependent variable
In [ ]:
In [240]:
y.shape
Out[240]:
In [241]:
X = scattermatrix7.drop(['PDR', 'total_reads', 'protocol'], axis=1)
In [242]:
X.shape
Out[242]:
In [243]:
X.head()
Out[243]:
In [244]:
categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)
In [245]:
X.head()
Out[245]:
In [246]:
# only use type_CLL
X_CLL = X.drop("type_normal", axis=1)
X_CLL.head()
Out[246]:
In [247]:
from sklearn import linear_model
clf = linear_model.LinearRegression()
clf.fit(X,y)  # note: X still holds both type dummies, which are collinear with the intercept; X_CLL below drops one
Out[247]:
In [248]:
X.columns
Out[248]:
In [249]:
print(clf.coef_)
In [250]:
clf.score(X,y) # Returns the coefficient of determination R^2 of the prediction.
Out[250]:
In [251]:
from sklearn.feature_selection import f_regression
print("p-values are: ")
print(f_regression(X, y)[1])
In [252]:
clf = linear_model.LinearRegression()
clf.fit(X_CLL,y) # it's either CLL 0 or CLL 1
Out[252]:
In [253]:
X_CLL.columns
Out[253]:
In [254]:
print(clf.coef_)
In [255]:
clf.score(X_CLL,y)
Out[255]:
In [256]:
print("p-values are: ")
print(f_regression(X_CLL, y)[1])
In [257]:
# Bayesian ridge regression
clf = linear_model.BayesianRidge()
clf.fit(X,y)
Out[257]:
In [126]:
print(clf.coef_)
In [127]:
clf.score(X,y)
Out[127]:
In [128]:
from sklearn.feature_selection import f_regression
print(f_regression(X, y)[1])  # univariate p-values
In [129]:
# Lasso
clf = linear_model.Lasso()
clf.fit(X,y)
Out[129]:
In [130]:
print(clf.coef_) # why zero coefficients? the default alpha=1.0 penalty is strong for these unscaled features, so Lasso shrinks them toward (or to) zero
In [131]:
clf.score(X,y) # why so low? with the coefficients shrunk away, little of the variance in PDR is explained
Out[131]:
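In [ ]:
# Why the Lasso looks so poor here: the default penalty (alpha=1.0) is strong relative to these
# unscaled predictors, so the coefficients get shrunk toward or to zero. A minimal sketch,
# assuming the X and y built above: standardize the predictors and let cross-validation pick
# alpha (the cv value here is an arbitrary illustration, not part of the original analysis).
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV

X_scaled = StandardScaler().fit_transform(X)       # zero mean, unit variance per column
lasso_cv = LassoCV(cv=5, random_state=36).fit(X_scaled, y)
print(lasso_cv.alpha_)                             # penalty selected by cross-validation
print(lasso_cv.coef_)                              # coefficients on the standardized predictors
print(lasso_cv.score(X_scaled, y))                 # R^2 on the training data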
In [132]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)
Out[132]:
In [133]:
X.columns
Out[133]:
In [134]:
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances = feature_importances.sort_values()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance: avgReadCpGs_mean, Unique_CpGs, bsRate, type")
Out[134]:
In [135]:
model.score(X,y)
Out[135]:
In [136]:
scattermatrix7.columns
Out[136]:
In [137]:
sns.lmplot(x="bsRate", y="PDR", data=scattermatrix7, hue='type')
plt.title("PDR vs. bsRate, by type")
Out[137]:
In [138]:
sns.lmplot(x="bsRate", y="PDR", data=scattermatrix7, hue='protocol')
plt.title("PDR vs. bsRate, by protocol")
Out[138]:
In [ ]:
In [139]:
sns.lmplot(x="avgReadCpGs_mean", y="PDR", data=scattermatrix7, hue='type')
plt.title("PDR vs. bsRate, by type")
Out[139]:
In [140]:
sns.lmplot(x="avgReadCpGs_mean", y="PDR", data=scattermatrix7, hue='protocol')
plt.title("PDR vs. bsRate, by protocol")
Out[140]:
In [ ]:
In [ ]:
In [ ]:
In [141]:
sns.jointplot(x="bsRate", y="PDR", data=scattermatrix7, kind="reg")
plt.title("PDR vs. bsRate, jointplot")
Out[141]:
In [142]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR", data=scattermatrix7, kind="reg")
Out[142]:
In [143]:
sns.jointplot(x="Unique_CpGs", y="PDR", data=scattermatrix7, kind="reg")
plt.title("Unique CpGs vs PDR")
Out[143]:
In [ ]:
In [144]:
type_cll = scattermatrix7[scattermatrix7.type == 'CLL']
type_normal = scattermatrix7[scattermatrix7.type == 'normal']
In [145]:
sns.jointplot(x="bsRate", y="PDR", data=type_cll, kind="reg", color='r')
plt.title("PDR vs. bsRate, CLL samples, jointplot")
Out[145]:
In [146]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR", data=type_cll, kind="reg", color='r')
Out[146]:
In [147]:
sns.jointplot(x="Unique_CpGs", y="PDR", data=type_cll, kind="reg", color='r')
plt.title("Unique CpGs vs PDR")
Out[147]:
In [ ]:
In [148]:
sns.jointplot(x="bsRate", y="PDR", data=type_normal, kind="reg", color='g')
plt.title("PDR vs. bsRate, normal samples, jointplot")
Out[148]:
In [149]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR", data=type_normal, kind="reg", color='g')
Out[149]:
In [150]:
sns.jointplot(x="Unique_CpGs", y="PDR", data=type_normal, kind="reg", color='g')
plt.title("Unique CpGs vs PDR")
Out[150]:
In [ ]:
In [151]:
# Can we fit a model with just two variables: BS conversion rate and CLL vs. normal?
In [152]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=scattermatrix7["type"], y=scattermatrix7["bsRate"], hue=merged["bio"], jitter=True)
#sns.plt.title("Biological sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
#plt.xlabel("Normal cells = 304 *.anno files; CLL cells = 112 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
Out[152]:
In [153]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=scattermatrix7["type"], y=scattermatrix7["bsRate"], hue=scattermatrix7["type"], jitter=True)
#sns.plt.title("Biological sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
#plt.xlabel("Normal cells = 304 *.anno files; CLL cells = 112 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
Out[153]:
In [154]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=scattermatrix7["type"], y=scattermatrix7["bsRate"], hue=scattermatrix7["protocol"], jitter=True)
#sns.plt.title("Biological sorting: Total PDR per cell, normal vs CLL, chromosomes 2, 5, & 11")
#plt.xlabel("Normal cells = 304 *.anno files; CLL cells = 112 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
Out[154]:
In [155]:
y = scattermatrix7.PDR # dependent variable
X = scattermatrix7.drop(['PDR', 'total_reads', 'protocol', 'avgReadCpGs_mean', 'Unique_CpGs'], axis=1)
In [156]:
X.shape
Out[156]:
In [157]:
X.head()
Out[157]:
In [158]:
categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)
In [159]:
from sklearn import linear_model
clf = linear_model.LinearRegression()
clf.fit(X,y)
Out[159]:
In [160]:
X.columns
Out[160]:
In [161]:
print(clf.coef_)
In [162]:
clf.score(X,y)
Out[162]:
In [163]:
# Lasso
clf = linear_model.Lasso()
clf.fit(X,y)
Out[163]:
In [164]:
print(clf.coef_)
In [165]:
clf.score(X,y)
Out[165]:
In [166]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)
Out[166]:
In [167]:
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances = feature_importances.sort_values()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance: two variables: bs conversion and cll_vs-normal")
Out[167]:
In [168]:
model.score(X,y)
Out[168]:
In [169]:
from sklearn.feature_selection import f_regression
print(f_regression(X, y)[1])
# f_regression returns:
#   F    : array, shape (n_features,) -- the F-statistic of each feature
#   pval : array, shape (n_features,) -- the p-value of each F-statistic
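In [ ]:
# Small sketch: label the univariate F-statistics and p-values by predictor for readability.
# Keep in mind these are per-feature univariate tests against y, not the joint-OLS coefficient
# p-values (those come from the statsmodels summaries below). Assumes the X and y defined above.
from sklearn.feature_selection import f_regression

F_vals, p_vals = f_regression(X, y)
print(pd.DataFrame({"F": F_vals, "p_value": p_vals}, index=X.columns))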
In [258]:
# multiple regression model describes the response as a weighted sum of the predictors:
# 'avgReadCpGs_mean', 'Unique_CpGs', 'bsRate', 'type_CLL'
#
# PDR = \beta_0 + \beta_1 \cdot \text{avgReadCpGs\_mean} + \beta_2 \cdot \text{Unique\_CpGs} + \beta_3 \cdot \text{bsRate} + \beta_4 \cdot \text{type\_CLL}
#
In [259]:
import statsmodels.api as sm
In [263]:
y = scattermatrix7.PDR # dependent variable
X = scattermatrix7.drop(['PDR', 'total_reads', 'protocol'], axis=1)
In [264]:
categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)
In [265]:
X.columns
Out[265]:
In [266]:
X = X.drop('type_CLL', axis=1)
In [267]:
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
In [268]:
est.summary()
Out[268]:
In [280]:
y = scattermatrix7.PDR # dependent variable
X = scattermatrix7.drop(['PDR', 'total_reads', 'protocol', 'avgReadCpGs_mean', 'Unique_CpGs'], axis=1)
In [281]:
categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)
In [282]:
X.columns
Out[282]:
In [283]:
X = X.drop('type_normal', axis=1)
In [428]:
X.columns
Out[428]:
In [429]:
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
In [430]:
est.summary()
Out[430]:
In [ ]:
In [523]:
tritopool = merged[merged["protocol"] == 'trito_pool_1']
In [ ]:
In [524]:
tritopool = tritopool.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
In [525]:
tritopool = tritopool.reset_index(drop=True)
In [526]:
tritopool.head()
Out[526]:
In [ ]:
In [527]:
tritopoolA = tritopool.set_index("filename")
In [528]:
from itertools import combinations
cc = list(combinations(tritopool.filename,2))
In [529]:
len(cc)
Out[529]:
In [530]:
out = pd.DataFrame([tritopoolA.loc[list(c), :].mean() for c in cc], index=cc)
In [531]:
out.head()
Out[531]:
In [532]:
tritopoolA
Out[532]:
In [533]:
df_ex = pd.DataFrame(np.abs(np.subtract.outer(tritopool.PDR, tritopool.PDR)), tritopool.filename, tritopool.filename)
In [534]:
df_ex.head()
Out[534]:
In [535]:
row = df_ex.iloc[0]
In [536]:
row.values
Out[536]:
In [537]:
pdr = tritopool[['filename', 'PDR']]
In [538]:
pdr = pdr.set_index("filename")
In [539]:
pdr
Out[539]:
In [540]:
stacked = df_ex.stack()
In [541]:
#stacked.index
In [542]:
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
In [543]:
pdr_differences.head()
Out[543]:
In [544]:
out['filename'] = out.index
In [545]:
out = out.reset_index(drop=True)
In [546]:
out.head()
Out[546]:
In [547]:
pairs = pd.merge(out, pdr_differences, how='inner')
In [548]:
pairs.head()
Out[548]:
In [549]:
pairs.shape
Out[549]:
In [550]:
"""
2. We need a regression model for predicting PDR distance between any two cells (highest priority):
Calculate the absolute difference in PDR between any two cells. Choose only pairs of cells that are in the same batch to control for batch effects.
For each pair, generate an average of the two cells for: BS rate, number of unique CpG, Median Average Read CpG per cell.
Variables of the regression model are:
- Number of unique CpGs per pair
- Median Average Read CpG per pair (or mean if normally distributed)
- BS rate per pair
- CLL or Normal per pair
What is the coefficient and the P value for each variable for a model predicting PDR distance between two cells?
"""
Out[550]:
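In [ ]:
# Hedged sketch of the pairwise construction described in the docstring above: for every
# within-batch pair of cells, average the covariates and attach the absolute PDR difference.
# pair_table() is a hypothetical helper, not part of this notebook's pipeline; it assumes a
# per-cell frame (like tritopool) with a 'filename' column, a 'PDR' column and numeric covariates.
from itertools import combinations

def pair_table(cells):
    indexed = cells.set_index("filename")
    rows = []
    for a, b in combinations(cells["filename"], 2):
        averaged = indexed.loc[[a, b]].mean().to_dict()   # mean covariates (and PDR) of the pair
        averaged["PDR_difference"] = abs(indexed.loc[a, "PDR"] - indexed.loc[b, "PDR"])
        averaged["filename"] = (a, b)
        rows.append(averaged)
    return pd.DataFrame(rows)

# e.g. pairs_sketch = pair_table(tritopool)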
In [551]:
pairs = pairs.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
In [552]:
pairs.head()
Out[552]:
In [553]:
pairs.columns
Out[553]:
In [554]:
y = pairs.PDR_difference # dependent variable
X = pairs.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
In [555]:
y.shape
Out[555]:
In [556]:
X.shape
Out[556]:
In [557]:
X.head()
Out[557]:
In [558]:
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
In [559]:
est.summary()
Out[559]:
In [560]:
pairs.columns
Out[560]:
In [585]:
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, RRBS_trito_pool CLL trito_1")
Out[585]:
In [587]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference", data=pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")
Out[587]:
In [588]:
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference", data=pairs, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, RRBS_trito_pool CLL trito_1")
Out[588]:
In [564]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances = feature_importances.sort_values()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
Out[564]:
In [565]:
model.score(X,y) # RF model score
Out[565]:
In [567]:
# cw154_Tris_protease
# NormalBCD19pCD27pcell23_44
# NormalBCD19pCD27mcell23_44
# normal_B_cell_C1_24
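In [ ]:
# Hedged sketch: the cells below repeat the same pair construction and OLS fit batch by batch;
# the same work could be driven by a loop over the protocols actually used in this notebook.
# Relies on the hypothetical pair_table() helper sketched earlier and on the merged table above.
batch_types = {
    "trito_pool_1": "CLL",
    "cw154_Tris_protease": "CLL",
    "NormalBCD19pCD27pcell23_44": "normal",
    "NormalBCD19pCD27mcell23_44": "normal",
}
per_batch = []
for protocol, cell_type in batch_types.items():
    cells = merged[merged["protocol"] == protocol].drop(
        ["type", "bio", "protocol", "avgReadCpGs_median"], axis=1).reset_index(drop=True)
    batch = pair_table(cells)
    batch["type"] = cell_type
    per_batch.append(batch)
all_pairs_sketch = pd.concat(per_batch).reset_index(drop=True)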
In [568]:
cw154 = merged[merged["protocol"] == 'cw154_Tris_protease']
In [569]:
cw154 = cw154.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
In [570]:
cw154 = cw154.reset_index(drop=True)
In [571]:
cw154A = cw154.set_index("filename")
from itertools import combinations
cc = list(combinations(cw154.filename,2))
out = pd.DataFrame([cw154A.loc[list(c), :].mean() for c in cc], index=cc)
In [572]:
df_ex = pd.DataFrame(np.abs(np.subtract.outer(cw154.PDR, cw154.PDR)), cw154.filename, cw154.filename)
In [573]:
stacked = df_ex.stack()
In [574]:
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
In [575]:
pdr_differences.head()
Out[575]:
In [576]:
out['filename'] = out.index
out = out.reset_index(drop=True)
In [577]:
pairs2 = pd.merge(out, pdr_differences, how='inner')
In [578]:
pairs2 = pairs2.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
In [579]:
pairs2.head()
Out[579]:
In [582]:
y = pairs2.PDR_difference # dependent variable
X = pairs2.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
In [583]:
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
est.summary()
Out[583]:
In [589]:
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs2, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, cw154_Tris_protease")
Out[589]:
In [590]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference", data=pairs2, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, cw154_Tris_protease")
Out[590]:
In [591]:
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference", data=pairs2, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, cw154_Tris_protease")
Out[591]:
In [595]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances = feature_importances.sort_values()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(model.score(X,y))
In [596]:
pcell = merged[merged["protocol"] == 'NormalBCD19pCD27pcell23_44']
pcell = pcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
pcell = pcell.reset_index(drop=True)
In [597]:
pcellA = pcell.set_index("filename")
from itertools import combinations
cc = list(combinations(pcell.filename,2))
out = pd.DataFrame([pcellA.loc[list(c), :].mean() for c in cc], index=cc)
In [598]:
df_ex = pd.DataFrame(np.abs(np.subtract.outer(pcell.PDR, pcell.PDR)), pcell.filename, pcell.filename)
In [599]:
stacked = df_ex.stack()
In [600]:
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
In [601]:
out['filename'] = out.index
out = out.reset_index(drop=True)
In [602]:
pairs3 = pd.merge(out, pdr_differences, how='inner')
pairs3 = pairs3.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
In [603]:
y = pairs3.PDR_difference # dependent variable
X = pairs3.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
est.summary()
Out[603]:
In [604]:
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs3, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, CD27 pcell ")
Out[604]:
In [605]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference", data=pairs3, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, CD27 pcell")
Out[605]:
In [606]:
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference", data=pairs3, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, CD27 pcell")
Out[606]:
In [607]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances = feature_importances.sort_values()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(model.score(X,y))
In [623]:
mcell = merged[merged["protocol"] == 'NormalBCD19pCD27mcell23_44']
mcell = mcell.drop(["type", "bio", "protocol", "avgReadCpGs_median"], axis=1)
mcell = mcell.reset_index(drop=True)
In [624]:
mcellA = mcell.set_index("filename")
from itertools import combinations
cc = list(combinations(mcell.filename,2))
out = pd.DataFrame([mcellA.loc[list(c), :].mean() for c in cc], index=cc)
In [625]:
df_ex = pd.DataFrame(np.abs(np.subtract.outer(mcell.PDR, mcell.PDR)), mcell.filename, mcell.filename)
In [626]:
stacked = df_ex.stack()
In [627]:
pdr_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
In [628]:
out['filename'] = out.index
out = out.reset_index(drop=True)
In [629]:
pairs4 = pd.merge(out, pdr_differences, how='inner')
pairs4 = pairs4.rename(columns = {'total_reads':'total_reads_mean', 'Unique_CpGs':'Unique_CpGs_mean', "bsRate":"bsRate_mean"})
In [632]:
y = pairs4.PDR_difference # dependent variable
X = pairs4.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
est.summary()
Out[632]:
In [633]:
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=pairs4, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, CD27 mcell ")
Out[633]:
In [634]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference", data=pairs4, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, CD27 mcell")
Out[634]:
In [635]:
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference", data=pairs4, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, CD27 mcell")
Out[635]:
In [636]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances = feature_importances.sort_values()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(model.score(X,y))
In [639]:
pairs['type'] = str('CLL')
In [643]:
pairs2['type'] = str('CLL')
In [647]:
pairs3['type'] = str('normal')
In [648]:
pairs4['type'] = str('normal')
In [649]:
frames22 = [pairs, pairs2, pairs3, pairs4]
total_pairs = pd.concat(frames22)
In [650]:
total_pairs.head()
Out[650]:
In [651]:
total_pairs.columns
Out[651]:
In [ ]:
In [658]:
y = total_pairs.PDR_difference # dependent variable
X = total_pairs.drop(['PDR', 'PDR_difference', 'total_reads_mean', 'filename'], axis=1)
In [659]:
categorical_variables = ['type']
for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)
In [660]:
X.columns
Out[660]:
In [661]:
X = X.drop(['type_normal'], axis=1)
In [662]:
X.columns
Out[662]:
In [663]:
X = sm.add_constant(X)
est = sm.OLS(y, X).fit()
est.summary()
Out[663]:
In [664]:
sns.jointplot(x="bsRate_mean", y="PDR_difference", data=total_pairs, kind="reg")
plt.title("PDR_difference vs. bsRate, jointplot, both CLL and normal ")
Out[664]:
In [670]:
sns.jointplot(x="avgReadCpGs_mean", y="PDR_difference", data=total_pairs, kind="reg")
plt.title("PDR_difference vs. avgReadCpGs_mean, jointplot, both CLL and normal ")
Out[670]:
In [671]:
sns.jointplot(x="Unique_CpGs_mean", y="PDR_difference", data=total_pairs, kind="reg")
plt.title("PDR_difference vs. Unique_CpGs_mean, jointplot, both CLL and normal ")
Out[671]:
In [672]:
total_pairs.columns
Out[672]:
In [673]:
sns.lmplot(x="bsRate_mean", y="PDR_difference", data=total_pairs, hue='type')
Out[673]:
In [674]:
sns.lmplot(x="avgReadCpGs_mean", y="PDR_difference", data=total_pairs, hue='type')
Out[674]:
In [675]:
sns.lmplot(x="Unique_CpGs_mean", y="PDR_difference", data=total_pairs, hue='type')
Out[675]:
In [676]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
model = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=36)
model.fit(X, y)
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances = feature_importances.sort_values()
feature_importances.plot(kind="barh", figsize=(7,6))
plt.title("Feature importance predicting PDR_difference: avgReadCpGs_mean, Unique_CpGs_mean, bsRate_mean")
print(model.score(X,y))
In [ ]:
In [ ]:
In [ ]:
In [ ]: