In [1]:
%pylab inline
import statsmodels.api as sm
from statsmodels.graphics.api import abline_plot
In [2]:
np.random.seed(42)
# sites
N = 100
# reads
n_reads = np.random.poisson(lam=10, size=N)
x = np.random.normal(0, 2, N)
# make the x a probibility
p = np.exp(x)/(1+np.exp(x))
y = np.random.binomial(n=n_reads, p=p, size=N)
ratio = y/n_reads
ordered_x = x[x.argsort()]
ordered_y = y[x.argsort()]
ordered_reads = n_reads[x.argsort()]
ordered_ratio = ratio[x.argsort()]
In [3]:
fig = plt.figure(figsize=(12,6))
ax = fig.add_subplot(121)
ax.scatter(np.arange(1, len(ordered_x)+1), ordered_reads, color='black', label="reads")
ax.scatter(np.arange(1, len(ordered_x)+1), ordered_y, color='red', label="methylated")
ax.legend()
ax.set_title('Methylated reads and total number of reads')
ax = fig.add_subplot(122)
ax.scatter(np.arange(1, len(ordered_x)+1), ordered_y, color='red', label="methylated_ratio")
ax.legend()
ax.set_title('Methylated reads ratio')
Out[3]:
In [4]:
lm_model = sm.OLS(ratio, x)
lm_fit = lm_model.fit()
print(lm_fit.summary())
In [5]:
fig, ax = plt.subplots(figsize=(6,6))
fig = sm.graphics.plot_fit(lm_fit, 0, ax=ax)
In [6]:
y_pred_linear = lm_fit.predict(x)
fig, ax = plt.subplots()
ax.scatter(y_pred_linear, ratio)
line_fit = sm.OLS(ratio, sm.add_constant(y_pred_linear, prepend=True)).fit()
abline_plot(model_results=line_fit, ax=ax)
ax.set_title('Model Fit Plot')
ax.set_ylabel('Observed values')
ax.set_xlabel('Fitted values');
In [7]:
glm_model = sm.GLM(y/n_reads, x, family=sm.families.Binomial(), weights=n_reads)
glm_fit = glm_model.fit()
print(glm_fit.summary())
In [8]:
print(glm_fit.params)
In [9]:
nobs = glm_fit.nobs
y_actual = ratio
yhat = glm_fit.mu
In [10]:
fig, ax = plt.subplots()
ax.scatter(yhat, y_actual)
line_fit = sm.OLS(y_actual, sm.add_constant(yhat, prepend=True)).fit()
abline_plot(model_results=line_fit, ax=ax)
ax.set_title('Model Fit Plot')
ax.set_ylabel('Observed values')
ax.set_xlabel('Fitted values');