I have gone through your code and see a few things that I think may be problems. I have tried to point them out below.
In [1]:
# Import various packages
import pandas as pd
import sas7bdat as SAS
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline
sns.set_style('whitegrid')
In [2]:
# Plotting functions for easy plots
def msKDE(dat, header, ax, xlim=(-10, 10)):
ax.axvline(0, color='r')
ax.set_title(header)
for name, grp in dat.groupby('mating_status'):
grp[header].plot(kind='kde', label=name, ax=ax, xlim=xlim, legend=True)
In [3]:
# Import the SAS DataSet that you created
fname = '../../sas_data/cis_est_v11.sas7bdat'
sasDat = SAS.SAS7BDAT(fname)
dat = sasDat.to_data_frame()
dat.head()
Out[3]:
In [4]:
# Get a random set of 100 Fusion IDs for plotting
## Get list of all fusions
fusions = np.array(list(set(dat['fusion_id'].tolist())))
## Randomly select 100 fusions
np.random.seed(12345)
fus100 = np.random.choice(fusions, size=100, replace=False)
print(fus100)
In [5]:
# Filter dataset to only have 100 Fusions
dat100 = dat[dat['fusion_id'].isin(fus100)]
print(dat100.shape)
dat100.head(10)
Out[5]:
In [6]:
# Looking at all Data
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
msKDE(dat, 'C_t', ax1)
msKDE(dat, 'c_i', ax2)
In [7]:
# Looking at 100 fusions only
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
msKDE(dat100, 'C_t', ax1, xlim=(-5, 5))
msKDE(dat100, 'c_i', ax2, xlim=(-5, 5))
In [8]:
# Mated
groups = dat[dat['mating_status'] == 'M'].groupby('line')
fig, axes = plt.subplots(7, 7, figsize=(20, 20), sharey=False, sharex=False)
fig.suptitle('Mated')
axes = np.ravel(axes)
for i, (line, grp) in enumerate(groups):
grp['C_t'].plot(kind='kde', ax=axes[i])
axes[i].set_title(line)
axes[i].axvline(0, color='r')
axes[i].xaxis.set_visible(False)
axes[i].yaxis.set_visible(False)
In [9]:
# Virgin
groups = dat[dat['mating_status'] == 'V'].groupby('line')
fig, axes = plt.subplots(7, 7, figsize=(20, 20), sharey=False, sharex=False)
fig.suptitle('Virgin')
axes = np.ravel(axes)
for i, (line, grp) in enumerate(groups):
grp['C_t'].plot(kind='kde', ax=axes[i])
axes[i].set_title(line)
axes[i].axvline(0, color='r')
axes[i].xaxis.set_visible(False)
axes[i].yaxis.set_visible(False)
Now if I plot c_i vs q5_mean_theta, I would expect a linear relationship between cis-line and q5 mean theta. The majority of fusions show a negative relationship between these two values. For example, I think Mated F672_SI looks good, but Mated F32473_SI is not good.
I could explain away the odd looking fusions as having a more influenctial trans or C_t effect.
In [10]:
# Mated
groups = dat100[dat100['mating_status'] == 'M'].groupby('fusion_id')
fig, axes = plt.subplots(10, 10, figsize=(20, 20), sharey=False, sharex=False)
axes = np.ravel(axes)
for i, (fus, grp) in enumerate(groups):
sns.regplot(x='c_i', y='q5_mean_theta', data=grp, ax=axes[i], color='DarkBlue', ci=None)
axes[i].set_title(fus)
axes[i].xaxis.set_visible(False)
axes[i].yaxis.set_visible(False)
In [11]:
# Virgin
groups = dat100[dat100['mating_status'] == 'V'].groupby('fusion_id')
fig, axes = plt.subplots(10, 10, figsize=(20, 20), sharey=False, sharex=False)
axes = np.ravel(axes)
for i, (fus, grp) in enumerate(groups):
sns.regplot(x='c_i', y='q5_mean_theta', data=grp, ax=axes[i], color='DarkBlue', ci=None)
axes[i].set_title(fus)
axes[i].xaxis.set_visible(False)
axes[i].yaxis.set_visible(False)
$T_{t1} = 2(\frac{\sum E_t}{n} - \frac{\sum E_l}{n} - C_t)$
$T_{i1a} = 2(E_l - \frac{\sum E_l}{n} - C_i) - T_{t1}$
I have a few concerns with this method. I don't see the reasoning for using $\frac{\sum E_l}{n}$ as the allele mean. I am also concerned by the fact that $T_{t1}$ is always $\sim 0$
In [12]:
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
msKDE(dat, 'T_t_1', ax)
That being said, T_i_1a behaves as I would expect.
In [13]:
# Looking at all Data
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
msKDE(dat, 'T_i_1a', ax)
In [14]:
# Looking at 100 fusions only
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
msKDE(dat100, 'T_t_1', ax1, xlim=(-5, 5))
msKDE(dat100, 'T_i_1a', ax2, xlim=(-5, 5))
In [15]:
# Mated
groups = dat100[dat100['mating_status'] == 'M'].groupby('fusion_id')
fig, axes = plt.subplots(10, 10, figsize=(20, 20), sharey=False, sharex=False)
axes = np.ravel(axes)
for i, (fus, grp) in enumerate(groups):
sns.regplot(x='T_i_1a', y='q5_mean_theta', data=grp, ax=axes[i], color='DarkBlue', ci=None)
axes[i].set_title(fus)
axes[i].xaxis.set_visible(False)
axes[i].yaxis.set_visible(False)
In [16]:
# Virgin
groups = dat100[dat100['mating_status'] == 'V'].groupby('fusion_id')
fig, axes = plt.subplots(10, 10, figsize=(20, 20), sharey=False, sharex=False)
axes = np.ravel(axes)
for i, (fus, grp) in enumerate(groups):
sns.regplot(x='T_i_1a', y='q5_mean_theta', data=grp, ax=axes[i], color='DarkBlue', ci=None)
axes[i].set_title(fus)
axes[i].xaxis.set_visible(False)
axes[i].yaxis.set_visible(False)
In [17]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
msKDE(dat, 'T_t_2', ax1)
msKDE(dat, 'T_i_2a', ax2)
In [18]:
# Mated
groups = dat100[dat100['mating_status'] == 'M'].groupby('fusion_id')
fig, axes = plt.subplots(10, 10, figsize=(20, 20), sharey=False, sharex=False)
axes = np.ravel(axes)
for i, (fus, grp) in enumerate(groups):
sns.regplot(x='T_i_2a', y='q5_mean_theta', data=grp, ax=axes[i], color='DarkBlue', ci=None)
axes[i].set_title(fus)
axes[i].xaxis.set_visible(False)
axes[i].yaxis.set_visible(False)
In [19]:
# Virgin
groups = dat100[dat100['mating_status'] == 'V'].groupby('fusion_id')
fig, axes = plt.subplots(10, 10, figsize=(20, 20), sharey=False, sharex=False)
axes = np.ravel(axes)
for i, (fus, grp) in enumerate(groups):
sns.regplot(x='T_i_2a', y='q5_mean_theta', data=grp, ax=axes[i], color='DarkBlue', ci=None)
axes[i].set_title(fus)
axes[i].xaxis.set_visible(False)
axes[i].yaxis.set_visible(False)
In [20]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
msKDE(dat, 'T_t_3', ax1)
msKDE(dat, 'T_i_3a', ax2)
In [21]:
# Mated
groups = dat100[dat100['mating_status'] == 'M'].groupby('fusion_id')
fig, axes = plt.subplots(10, 10, figsize=(20, 20), sharey=False, sharex=False)
axes = np.ravel(axes)
for i, (fus, grp) in enumerate(groups):
sns.regplot(x='T_i_3a', y='q5_mean_theta', data=grp, ax=axes[i], color='DarkBlue', ci=None)
axes[i].set_title(fus)
axes[i].xaxis.set_visible(False)
axes[i].yaxis.set_visible(False)
In [22]:
# Virgin
groups = dat100[dat100['mating_status'] == 'V'].groupby('fusion_id')
fig, axes = plt.subplots(10, 10, figsize=(20, 20), sharey=False, sharex=False)
axes = np.ravel(axes)
for i, (fus, grp) in enumerate(groups):
sns.regplot(x='T_i_3a', y='q5_mean_theta', data=grp, ax=axes[i], color='DarkBlue', ci=None)
axes[i].set_title(fus)
axes[i].xaxis.set_visible(False)
axes[i].yaxis.set_visible(False)
In [ ]: