This notebook works through an idea for correcting heteroskedasticity which uses rotation matrices to align samples. Previous iterations of seqpyplot (SPPLOT v0.3 and below) output a scatter plot that plots the expression values of control and treated samples and against their means. These plots revealed what appeared to be a bias where one sample tended towards overall higher expression than the other. Given the type of data being analyzed (Time series RNA seq data), it seemedunlikely that there would be such a bias inherent to the experiment.
In this notebook, I propose a method for correcting this bias using linear regression and linear transformations to rotate one of the two samples prior to differential experesssion inference.
The steps to performing this transformation are as follows
References: Rotation Matrices: https://en.wikipedia.org/wiki/Rotation_matrix
In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from random import randint as rand
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.metrics.pairwise import euclidean_distances
from scipy.linalg import svd
from seqpyplot.container.data_container import DataContainer
from seqpyplot.parsers.config_parser import config_parser
from pathlib import Path
from matplotlib import rcParams
rcParams['figure.figsize'] = (10, 10)
pd.options.mode.chained_assignment = None
In [2]:
def calc_theta(coef1, coef2):
"Returns an angle in radians"
return np.abs(
np.arctan(np.abs(coef1 - coef2) / (1. + (coef1 * coef2)))
)
In [3]:
def compute_rot_mat(rad, coef=.5):
" Compute a rotation matrix using rad for a given regression coefficient "
if coef < 1.0:
rotation_matrix = np.array([[np.cos(rad), -np.sin(rad)],
[np.sin(rad), np.cos(rad)]])
else:
rotation_matrix = np.array([[np.cos(rad), np.sin(rad)],
[-np.sin(rad), np.cos(rad)]])
return rotation_matrix
In [4]:
slope1 = 1.1
slope2 = 2.0
line1 = np.array([slope1 * x for x in range(10)])
line2 = np.array([slope2 * x for x in range(10)])
xs = list(range(10))
In [5]:
# Plot lines
plt.plot(xs, line1, color='black', label='line to rotate');
plt.plot(xs, line2, color='red', linewidth=5, label='stationary');
plt.axvline(0, linestyle='--')
plt.axhline(0, linestyle='--');
plt.annotate(s=('<-- we\'ll rotate this line'), xy=(xs[3]+0.08, line1[3]), rotation=-36)
plt.annotate(s=('<-- Direction of rotation'), xy=(xs[3] + 0.75, line1[3]+3.5), rotation=-15)
plt.ylim((-1, 10))
plt.xlim((-1, 10))
plt.legend(loc='upper right');
In [6]:
# Compute angle
angle_diff = calc_theta(slope1, slope2)
angle_diff
Out[6]:
In [7]:
# Compute rotation matrix
rot_matrix = compute_rot_mat(angle_diff)
rot_matrix
Out[7]:
In [11]:
# rotate line 1 (black line)
new_line1 = list()
for x, y in zip(xs, line1):
# need shape [[#], [#]]
old_point = np.array([[x], [y]])
new_point = np.dot(old_point, rot_matrix)
new_line1.append(new_point)
new_line1 = np.squeeze(np.asarray(new_line1))
In [12]:
xs[6], line1[6]
Out[12]:
In [10]:
plt.plot(xs, line2, color='red', linewidth=5, alpha=0.7, label='Stationary');
plt.plot(xs, line1, color='black', label='line to rotate')
plt.scatter(new_line1[:, 0], new_line1[:, 1], color='black', s=95)
plt.plot(new_line1[:, 0], new_line1[:, 1], color='black', linestyle='--', label='rotated line')
plt.annotate(s=('Original Line'), xy=(xs[6] + 0.7, line1[6]+0.3))
plt.annotate(s=('<-- Direction of rotation'), xy=(xs[6] - 1.7, line1[6]+0.8), rotation=-36)
plt.axvline(0, linestyle='--')
plt.axhline(0, linestyle='--')
for (x1, y1), (x2, y2) in zip(zip(xs, line1), new_line1):
plt.plot([x1, x2], [y1, y2], linestyle='--', color='black', alpha=0.4)
plt.ylim((-1, 10))
plt.xlim((-1, 10));
plt.legend(loc='upper right');
In [12]:
config = '../examples/example_config.ini'
config_obj = config_parser(config)
In [13]:
# load the data container_obj
container_obj = DataContainer(config_obj)
data, ercc_data = container_obj.parse_input()
In [14]:
cols = data.columns
cols
Out[14]:
In [15]:
df = data[['D1_Cont', 'D1_Treat']]
df.loc[:, 'mean'] = df.mean(axis=1)
In [16]:
df.head()
Out[16]:
In [17]:
d1 = df[['D1_Cont', 'mean']]
d2 = df[['D1_Treat', 'mean']]
In [18]:
fig, ax = plt.subplots()
d1.plot('mean', 'D1_Cont', kind='scatter', xlim=(0, 5000), ylim=(0, 5000), ax=ax, color='blue', alpha=0.4)
d2.plot('mean', 'D1_Treat', kind='scatter', xlim=(0, 5000), ylim=(0, 5000), ax=ax, color='red', alpha=0.4);
plt.annotate('The bias between samples is clearly seen in this plot!', (500, 3900), fontsize=14)
ax.set_title("Raw unnormalized data");
In [19]:
# Quick reset for this cell
d1 = df[['D1_Cont', 'mean']]
d2 = df[['D1_Treat', 'mean']]
# define regression objects
regCont = LinearRegression(fit_intercept=True)
regTreat = LinearRegression(fit_intercept=True)
# fit regression
regCont.fit(d1['D1_Cont'].values.reshape(-1, 1), d1['mean'].values.reshape(-1, 1))
regTreat.fit(d2['D1_Treat'].values.reshape(-1, 1), d1['mean'].values.reshape(-1, 1))
print(regCont.coef_, regCont.intercept_)
print(regTreat.coef_, regTreat.intercept_)
# Correct bias
d1['D1_Cont'] = d1['D1_Cont'] - regCont.intercept_
d2['D1_Treat'] = d2['D1_Treat'] - regTreat.intercept_
# Plot regression lines
fig, ax = plt.subplots()
d1.plot('mean', 'D1_Cont', kind='scatter', ax=ax, color='blue', alpha=0.4)
d2.plot('mean', 'D1_Treat', kind='scatter', xlim=(0, 8000), ylim=(0, 8000), ax=ax, color='red', alpha=0.4)
plt.plot([0, 8000], [0.0, regCont.coef_ * 8000], linestyle='--', color='black')
plt.plot([0, 8000], [0.0, regTreat.coef_ * 8000], linestyle='--', color='black');
ax.set_title("bias corrected, with best fit lines");
plt.ylim((-500, 8000))
plt.xlim((-500, 8000));
plt.axvline(0, linestyle='--')
plt.axhline(0, linestyle='--')
plt.legend();
In [20]:
correction_theta = calc_theta(np.squeeze(regCont.coef_), np.squeeze(regTreat.coef_))
correction_theta # in radians
rotation_matrix = compute_rot_mat(correction_theta)
rotation_matrix
new_treat = np.array([np.dot(rotation_matrix, d2.values[i, :]) for i in range(len(d2.values))])
new_treat
d2_cor = d2.copy()
d2_cor.loc[:, 'D1_Treat'] = new_treat[:, 0]
d2_cor.loc[:, 'mean'] = new_treat[:, 1]
fig, ax = plt.subplots()
d1.plot('mean', 'D1_Cont', kind='scatter', xlim=(0, 20000), ylim=(0, 20000), ax=ax, color='blue', alpha=0.4)
d2_cor.plot('mean', 'D1_Treat', kind='scatter', xlim=(0, 5000), ylim=(0, 5000), ax=ax, color='red', alpha=0.4);
ax.set_title("No TMM, linearly transformed")
plt.ylim((-1000, 8000))
plt.xlim((-1000, 8000));
plt.axvline(0, linestyle='--')
plt.axhline(0, linestyle='--')
plt.legend();
Successs! We have eliminated the sample divergence!
In [21]:
# load the data container_obj
config = '../examples/example_config.ini'
config_obj = config_parser(config)
container_obj = DataContainer(config_obj)
data, ercc_data = container_obj.parse_input()
data = container_obj.normalize_file_pairs(data) # Single df of normalized data
normed_df = data[['D1_Cont', 'D1_Treat']].copy()
normed_df.loc[:, 'mean'] = normed_df.mean(axis=1)
regCont = LinearRegression(fit_intercept=True)
regTreat = LinearRegression(fit_intercept=True)
regCont.fit(normed_df['D1_Cont'].values.reshape(-1, 1), normed_df['mean'].values.reshape(-1, 1))
regTreat.fit(normed_df['D1_Treat'].values.reshape(-1, 1), normed_df['mean'].values.reshape(-1, 1))
normed_df['D1_Cont'] = normed_df['D1_Cont'] - regCont.intercept_
normed_df['D1_Treat'] = normed_df['D1_Treat'] - regTreat.intercept_
fig, ax = plt.subplots()
normed_df.plot('mean', 'D1_Cont', kind='scatter',
xlim=(0, 5000), ylim=(0, 8000), ax=ax, color='blue', alpha=0.4, label='Control')
normed_df.plot('mean', 'D1_Treat', kind='scatter',
xlim=(0, 5000), ylim=(0, 8000), ax=ax, color='red', alpha=0.4, label='Treated')
# plot regression lines, with color switch!
plt.plot([0, 8000], [0.0, regCont.coef_ * 8000], linestyle='--', color='black')
plt.plot([0, 4000], [0.0, regTreat.coef_ * 4000], linestyle='--', color='white');
plt.plot([4000, 8000], [regTreat.coef_[0] * 4000.0, regTreat.coef_[0] * 8000], linestyle='--', color='black');
ax.set_title("TMM normalized expression data");
plt.ylim((-500, 8000))
plt.xlim((-500, 8000));
plt.axvline(0, linestyle='--')
plt.axhline(0, linestyle='--')
plt.legend();
In [22]:
correction_theta = calc_theta(np.squeeze(regCont.coef_), np.squeeze(regTreat.coef_))
rotation_matrix = compute_rot_mat(correction_theta, regTreat.coef_)
new_treat = np.array([np.dot(rotation_matrix, normed_df[['D1_Treat', 'mean']].values[i, :]) for i in range(len(normed_df))])
corr_df = normed_df.copy()
corr_df.loc[:, 'D1_Treat'] = new_treat[:, 0]
# corr_df.loc[:, 'mean'] = normed_df['mean'].values
corr_df.loc[:, 'mean'] = new_treat[:, 1]
fig, ax = plt.subplots()
normed_df.plot('mean', 'D1_Cont', kind='scatter', xlim=(0, 20000), ylim=(0, 20000), ax=ax, color='blue', alpha=0.4)
corr_df.plot('mean', 'D1_Treat', kind='scatter', xlim=(0, 5000), ylim=(0, 5000), ax=ax, color='red', alpha=0.4, s=10);
ax.set_title("With TMM, linearly transformed");
plt.ylim((-500, 15000))
plt.xlim((-500, 15000));
plt.axvline(0, linestyle='--')
plt.axhline(0, linestyle='--')
plt.legend();
In [23]:
data_copy = data.copy()[['D1_Cont', 'D1_Treat']]
In [24]:
percentiles = [.1, .2, .3, .4, .5, .6, .7, .8, .9, 0.95, 0.99]
In [25]:
data_copy.describe(percentiles=percentiles)
Out[25]:
In [26]:
data_copy2 = data_copy[(data_copy.D1_Cont != 0) & (data_copy.D1_Treat != 0)]
In [27]:
(data_copy2.D1_Cont - data_copy2.D1_Treat).abs().describe(percentiles=percentiles)
Out[27]:
In [28]:
(data_copy2.D1_Cont - data_copy2.D1_Treat).abs().describe(percentiles=percentiles).loc['80%']
Out[28]:
In [29]:
data_copy3 = (data_copy2.D1_Cont - data_copy2.D1_Treat).abs()
In [30]:
sns.boxplot(data_copy3[data_copy3 < 500]);
In [31]:
data_copy2.head()
Out[31]:
In [32]:
# load the data container_obj
config = '../examples/example_config.ini'
config_obj = config_parser(config)
container_obj = DataContainer(config_obj)
data, ercc_data = container_obj.parse_input()
data = container_obj.normalize_file_pairs(data) # Single df of normalized data
#----------------------------------------------------------
data_copy = data.copy()
data_copy.loc[:, 'mean'] = data_copy.mean(axis=1)
data_copy = data_copy[(data_copy.D1_Cont != 0) & (data_copy.D1_Treat != 0)]
data_copy.loc[:, 'abs_diff'] = (data_copy.D1_Cont - data_copy.D1_Treat).abs()
cutoff = (data_copy.D1_Cont - data_copy.D1_Treat).abs().describe(percentiles=percentiles).loc['80%']
data_copy = data_copy[data_copy['abs_diff'] < cutoff]
regCont = LinearRegression(fit_intercept=True)
regTreat = LinearRegression(fit_intercept=True)
regCont.fit(data_copy['D1_Cont'].values.reshape(-1, 1), data_copy['mean'].values.reshape(-1, 1))
regTreat.fit(data_copy['D1_Treat'].values.reshape(-1, 1), data_copy['mean'].values.reshape(-1, 1))
#----------------------------------------------------------
normed_df = data.copy()
normed_df = normed_df[['D1_Cont', 'D1_Treat']].copy()
normed_df.loc[:, 'mean'] = normed_df.mean(axis=1)
normed_df['D1_Cont'] = normed_df['D1_Cont'] - regCont.intercept_
normed_df['D1_Treat'] = normed_df['D1_Treat'] - regTreat.intercept_
fig, ax = plt.subplots()
normed_df.plot('mean', 'D1_Cont', kind='scatter',
xlim=(0, 5000), ylim=(0, 8000), ax=ax, color='blue', alpha=0.4, label='Control')
normed_df.plot('mean', 'D1_Treat', kind='scatter',
xlim=(0, 5000), ylim=(0, 8000), ax=ax, color='red', alpha=0.4, label='Treated')
# plot regression lines, with color switch!
plt.plot([0, 8000], [0.0, regCont.coef_ * 8000], linestyle='--', color='black')
plt.plot([0, 4000], [0.0, regTreat.coef_ * 4000], linestyle='--', color='white');
plt.plot([4000, 8000], [regTreat.coef_[0] * 4000.0, regTreat.coef_[0] * 8000], linestyle='--', color='black');
ax.set_title("TMM normalized expression data");
plt.ylim((-500, 8000))
plt.xlim((-500, 8000));
plt.axvline(0, linestyle='--')
plt.axhline(0, linestyle='--')
plt.legend();
In [33]:
correction_theta = calc_theta(np.squeeze(regCont.coef_), np.squeeze(regTreat.coef_))
rotation_matrix = compute_rot_mat(correction_theta)
regCont.coef_, regTreat.coef_
normed_df.head()
Out[33]:
In [34]:
new_treat = np.array([np.dot(rotation_matrix, normed_df[['D1_Cont', 'mean']].values[i, :]) for i in range(len(normed_df.values))])
rotated = pd.DataFrame(new_treat, columns=['Cont_cor', 'mean_cor'], index=normed_df.index)
fig, ax = plt.subplots()
rotated.plot('mean_cor', 'Cont_cor', kind='scatter', xlim=(0, 20000), ylim=(0, 20000), ax=ax, color='blue', alpha=0.4)
normed_df.plot('mean', 'D1_Treat', kind='scatter', xlim=(0, 5000), ylim=(0, 5000), ax=ax, color='red', alpha=0.4);
ax.set_title("No TMM, linearly transformed")
plt.ylim((-1000, 8000))
plt.xlim((-1000, 8000));
plt.axvline(0, linestyle='--')
plt.axhline(0, linestyle='--')
plt.legend();
In [35]:
# load the data container_obj
config = '../examples/example_config.ini'
config_obj = config_parser(config)
container_obj = DataContainer(config_obj)
data, ercc_data = container_obj.parse_input()
data = container_obj.normalize_file_pairs(data) # Single df of normalized data
#----------------------------------------------------------
data_copy = data.copy()
data_copy.loc[:, 'mean'] = data_copy.mean(axis=1)
data_copy = data_copy[(data_copy['mean'] > 100) & (data_copy['mean'] < 500)]
regCont = LinearRegression(fit_intercept=True)
regTreat = LinearRegression(fit_intercept=True)
regCont.fit(data_copy['D1_Cont'].values.reshape(-1, 1), data_copy['mean'].values.reshape(-1, 1))
regTreat.fit(data_copy['D1_Treat'].values.reshape(-1, 1), data_copy['mean'].values.reshape(-1, 1))
#----------------------------------------------------------
normed_df = data.copy()
normed_df = normed_df[['D1_Cont', 'D1_Treat']].copy()
normed_df.loc[:, 'mean'] = normed_df.mean(axis=1)
normed_df['D1_Cont'] = normed_df['D1_Cont'] - regCont.intercept_
normed_df['D1_Treat'] = normed_df['D1_Treat'] - regTreat.intercept_
fig, ax = plt.subplots()
normed_df.plot('mean', 'D1_Cont', kind='scatter',
xlim=(0, 5000), ylim=(0, 8000), ax=ax, color='blue', alpha=0.4, label='Control')
normed_df.plot('mean', 'D1_Treat', kind='scatter',
xlim=(0, 5000), ylim=(0, 8000), ax=ax, color='red', alpha=0.4, label='Treated')
# plot regression lines, with color switch!
plt.plot([0, 8000], [0.0, regCont.coef_ * 8000], linestyle='--', color='black')
plt.plot([0, 4000], [0.0, regTreat.coef_ * 4000], linestyle='--', color='white');
plt.plot([4000, 8000], [regTreat.coef_[0] * 4000.0, regTreat.coef_[0] * 8000], linestyle='--', color='black');
ax.set_title("TMM normalized expression data");
plt.ylim((-500, 8000))
plt.xlim((-500, 8000));
plt.axvline(0, linestyle='--')
plt.axhline(0, linestyle='--')
plt.legend();
In [36]:
correction_theta = calc_theta(np.squeeze(regCont.coef_), np.squeeze(regTreat.coef_))
rotation_matrix = compute_rot_mat(correction_theta)
In [37]:
np.squeeze(regCont.coef_), np.squeeze(regTreat.coef_)
Out[37]:
In [79]:
normed_df.head()
Out[79]:
In [80]:
coefficients
In [ ]:
In [45]:
new_treat = np.array([np.dot(rotation_matrix, normed_df[['D1_Treat', 'mean']].values[i, :]) for i in range(len(normed_df.values))])
rotated = pd.DataFrame(new_treat, columns=['Treat_cor', 'mean_cor'], index=normed_df.index)
fig, ax = plt.subplots()
rotated.plot('mean_cor', 'Treat_cor', kind='scatter', xlim=(0, 20000), ylim=(0, 20000), ax=ax, color='blue', alpha=0.4)
normed_df.plot('mean', 'D1_Cont', kind='scatter', xlim=(0, 5000), ylim=(0, 5000), ax=ax, color='red', alpha=0.4);
ax.set_title("TMM, linearly transformed, full corretion")
plt.ylim((-1000, 8000))
plt.xlim((-1000, 8000));
plt.axvline(0, linestyle='--')
plt.axhline(0, linestyle='--')
plt.legend();
In [69]:
# load the data container_obj
config = '../examples/example_config.ini'
config_obj = config_parser(config)
container_obj = DataContainer(config_obj)
data, ercc_data = container_obj.parse_input()
data = container_obj.normalize_file_pairs(data) # Single df of normalized data
#----------------------------------------------------------
data_copy = data.copy()
data_copy.loc[:, 'mean'] = data_copy.mean(axis=1)
data_copy = data_copy[(data_copy['mean'] > 100) & (data_copy['mean'] < 500)]
regCont = LinearRegression(fit_intercept=True)
regTreat = LinearRegression(fit_intercept=True)
regCont.fit(data_copy['D1_Cont'].values.reshape(-1, 1), data_copy['mean'].values.reshape(-1, 1))
regTreat.fit(data_copy['D1_Treat'].values.reshape(-1, 1), data_copy['mean'].values.reshape(-1, 1))
#----------------------------------------------------------
normed_df = data.copy()
normed_df = normed_df[['D1_Cont', 'D1_Treat']].copy()
normed_df.loc[:, 'mean'] = normed_df.mean(axis=1)
normed_df['D1_Cont'] = normed_df['D1_Cont'] - regCont.intercept_
normed_df['D1_Treat'] = normed_df['D1_Treat'] - regTreat.intercept_
correction_theta = calc_theta(np.squeeze(regCont.coef_), np.squeeze(regTreat.coef_))
rotation_matrix = compute_rot_mat(correction_theta)
new_treat = np.array([np.dot(rotation_matrix, normed_df[['D1_Treat', 'mean']].values[i, :]) for i in range(len(normed_df.values))])
rotated = pd.DataFrame(new_treat, columns=['Treat_cor', 'mean_cor'], index=normed_df.index)
rotated.loc[:, 'mean'] = normed_df['mean']
fig, ax = plt.subplots()
rotated.plot('mean', 'Treat_cor', kind='scatter', xlim=(0, 20000), ylim=(0, 20000), ax=ax, color='blue', alpha=0.4)
normed_df.plot('mean', 'D1_Cont', kind='scatter', xlim=(0, 5000), ylim=(0, 5000), ax=ax, color='red', alpha=0.4);
ax.set_title("TMM, linearly transformed")
plt.ylim((-1000, 18000))
plt.xlim((-1000, 18000));
plt.axvline(0, linestyle='--')
plt.axhline(0, linestyle='--')
plt.legend();
In [70]:
correction_theta = calc_theta()
In [76]:
coefs = list(map(float, [np.squeeze(regTreat.coef_), np.squeeze(regCont.coef_)]))
In [77]:
coefs
Out[77]:
In [78]:
calc_theta(*coefs)
Out[78]:
In [83]:
normed_df.columns[np.argmin(coefs)]
Out[83]: