Experiments to detect heteroskedasticity


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from random import randint as rand
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.metrics.pairwise import euclidean_distances

from scipy.linalg import svd

from SeqPy.container.data_container import DataContainer
from seqpyplot.parsers.config_parser import config_parser
from pathlib import Path

from matplotlib import rcParams

rcParams['figure.figsize'] = (10, 10)

pd.options.mode.chained_assignment = None


---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-1-c77fa420992d> in <module>()
     11 from scipy.linalg import svd
     12 
---> 13 from seqpyplot.container.data_container import DataContainer
     14 from seqpyplot.parsers.config_parser import config_parser
     15 from pathlib import Path

ModuleNotFoundError: No module named 'seqpyplot'

Helper functions

Detect Heteroskedasticity


In [9]:
config = '../examples/example_config.ini'
config_obj = config_parser(config)

In [10]:
config_obj


Out[10]:
<configparser.ConfigParser at 0x1a0880a588>

In [11]:
container_obj.file_pairs


Out[11]:
[('D1_Cont', 'D1_Treat'), ('D2_Cont', 'D2_Treat'), ('D3_Cont', 'D3_Treat')]

In [43]:
# load the data container_obj
container_obj = DataContainer(config_obj)
data, ercc_data = container_obj.parse_input()
data = container_obj.normalize_file_pairs(data) # Single df of normalized data

In [44]:
cols = data.columns
cols


Out[44]:
Index(['D1_Cont', 'D2_Cont', 'D3_Cont', 'D1_Treat', 'D2_Treat', 'D3_Treat'], dtype='object')

In [96]:
df = data[['D3_Cont', 'D3_Treat']]
df.loc[:, 'mean'] = df.mean(axis=1)

In [97]:
d1 = df[['D3_Cont', 'mean']]
d2 = df[['D3_Treat', 'mean']]

In [98]:
fig, ax = plt.subplots()
d1.plot('mean', 'D3_Cont', kind='scatter', xlim=(0, 5000), ylim=(0, 5000), ax=ax, color='blue', alpha=0.4)
d2.plot('mean', 'D3_Treat', kind='scatter', xlim=(0, 5000), ylim=(0, 5000), ax=ax, color='red', alpha=0.4);
ax.set_title("Raw unnormalized data");



In [48]:
# Quick reset for this cell
d1 = df[['D1_Cont', 'mean']]
d2 = df[['D1_Treat', 'mean']]

# define regression objects
regCont = LinearRegression(fit_intercept=True)
regTreat = LinearRegression(fit_intercept=True)

# fit regression
regCont.fit(d1['D1_Cont'].values.reshape(-1, 1), d1['mean'].values.reshape(-1, 1))
regTreat.fit(d2['D1_Treat'].values.reshape(-1, 1), d1['mean'].values.reshape(-1, 1))

print(regCont.coef_, regCont.intercept_)
print(regTreat.coef_, regTreat.intercept_)

# Correct bias
d1['D1_Cont'] = d1['D1_Cont'] - regCont.intercept_
d2['D1_Treat'] = d2['D1_Treat'] - regTreat.intercept_

# Plot regression lines
fig, ax = plt.subplots()
d1.plot('mean', 'D1_Cont', kind='scatter', ax=ax, color='blue', alpha=0.4)
d2.plot('mean', 'D1_Treat', kind='scatter', xlim=(0, 8000), ylim=(0, 8000), ax=ax, color='red',  alpha=0.4)
plt.plot([0, 8000], [0.0, regCont.coef_ * 8000], linestyle='--', color='black')
plt.plot([0, 8000], [0.0, regTreat.coef_ * 8000], linestyle='--', color='black');
ax.set_title("bias corrected, with best fit lines");

plt.ylim((-500, 8000))
plt.xlim((-500, 8000));
plt.axvline(0, linestyle='--')
plt.axhline(0, linestyle='--')
plt.legend();


No handles with labels found to put in legend.
[[1.09646016]] [-1.66985095]
[[0.91183697]] [5.21303275]

In [ ]:


In [49]:
plt.scatter(d1['D1_Cont'], d2['D1_Treat'])


Out[49]:
<matplotlib.collections.PathCollection at 0x1a095c79b0>

In [51]:
fig, ax = plt.subplots()
x1 = d1['mean']
y1 = d1['D1_Cont'].apply(lambda x: np.log2(x))
x2 = d2['mean']
y2 = d2['D1_Treat'].apply(lambda x: np.log2(x))

ax.scatter(x1, y1, c='red', alpha=0.3)
ax.scatter(x2, y2, c='blue', alpha=0.3)
ax.set_title("Raw unnormalized data");
ax.set_xlim((0, 2500))


Out[51]:
(0, 2500)

In [53]:
container_obj.file_pairs


Out[53]:
[('D1_Cont', 'D1_Treat'), ('D2_Cont', 'D2_Treat'), ('D3_Cont', 'D3_Treat')]

In [58]:
# results = list()
# for pair in container_obj.file_pairs:
#     df = data[list(pair)]
#     mean = df.mean(axis=1)
#     df = df.sub(mean)
#     results.append(df)

centeral all control observations on zero


In [120]:
results = list()
for pair in container_obj.file_pairs:
    df = data[list(pair)]
    df = df.apply(lambda x: x - df[df.columns[0]])
    results.append(df)
dat = pd.concat(results, axis=1)
dat = dat[[x[1] for x in container_obj.file_pairs]]

In [127]:
dat.loc[:, 'std'] = dat.std(axis=1)

In [128]:
dat.head()


Out[128]:
D1_Treat D2_Treat D3_Treat std
gene
0610005C13Rik 13.743831 -2.832119 -3.893247 9.890691
0610007P14Rik 27.891812 62.189405 -24.912972 43.877660
0610009B22Rik 68.121425 24.014577 12.939429 29.192252
0610009L18Rik 4.786526 -22.801595 -2.839870 14.246267
0610009O20Rik 118.126123 173.379152 -18.884659 98.986720

In [119]:
dat.loc['Mir6957']


Out[119]:
D1_Treat    0.0
D2_Treat    0.0
D3_Treat    0.0
std         0.0
Name: Mir6957, dtype: float64

In [145]:
with open('dataout.txt', 'w+') as outfile:
    for i in top_std_genes:
        outfile.write(str(i) + '\n')

In [139]:
dat.sort_values(by=['std'], ascending=False)['std'].std()


Out[139]:
595.4595733396428

In [144]:
top_std_genes = dat[dat.sort_values(by=['std'], ascending=False)['std'] >  dat.sort_values(by=['std'], ascending=False)['std'].std()].sort_values(by=['std'], ascending=False).index


/Users/pgradie/miniconda-3.9.1/envs/py3/lib/python3.6/site-packages/ipykernel_launcher.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  """Entry point for launching an IPython kernel.

In [146]:
len(top_std_genes)


Out[146]:
835

In [117]:
dat['std'].plot(kind='hist')


Out[117]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a160c5a58>

In [ ]:


In [89]:
np.random.randint(0, 100, 20).std()


Out[89]:
26.298811760229775

In [ ]:
dat

In [103]:
dat.head(20).std(axis=1
                 
                )


Out[103]:
gene
0610005C13Rik     6.385317
0610007P14Rik    30.193757
0610009B22Rik    26.625237
0610009L18Rik     9.781619
0610009O20Rik    79.979674
0610010B08Rik     1.159945
0610010F05Rik    96.921546
0610010K14Rik    37.660123
0610011F06Rik    19.747589
0610012G03Rik    38.916708
0610030E20Rik    36.409759
0610031O16Rik     0.408248
0610037L13Rik    16.739981
0610038B21Rik     7.402650
0610039H22Rik     4.405756
0610039K10Rik     2.726548
0610040B10Rik     2.397858
0610040F04Rik     0.517091
0610040J01Rik     5.854774
0610043K17Rik     5.055931
dtype: float64

In [ ]: