One surprising finding from the adding-genes SEM analysis was that we were not able to add genes to the DSPR data. This suggests that the DSPR SEM is overfit. Here I want to explore the covariance structure of both data sets to see if I can narrow down the possibilities.
There are two big differences between the CEGS and DSPR populations:
1) The technology used (RNA-seq vs microarray)
2) The structure of the population (Tester-cross with 75 founders vs RIL-cross with 15 diverse founders)
By exploring the variance patterns I hope to see what the differences look like in the data, and potentially figure out whether they can be separated with the data that I have in hand.
In [ ]:
%run 'ipython_startup.py'
In [ ]:
import seaborn as sns
In [ ]:
# Build the paths to the two input tables under PROJ.
dspr_csv = os.path.join(PROJ, 'analysis_output/mmc/dsrp_sex_det_genes_for_mmc.csv')
cegs_csv = os.path.join(PROJ, 'analysis_output/mmc/cegsV_sex_det_gene_for_mmc.csv')

# Read each table, using its `_NAME_` column as the row index.
dspr = pd.read_csv(dspr_csv, index_col='_NAME_')
cegs = pd.read_csv(cegs_csv, index_col='_NAME_')
In [ ]:
# Remove the row labelled 'Rm62' from the DSPR table, modifying it in place.
dspr.drop(labels='Rm62', axis=0, inplace=True)
How do genes covary in the sex hierarchy? Look at covariance and correlation structure among genes in SD.
$cov(x, y) = \frac{\sum^n_{i=1} (x_i - \bar x) (y_i - \bar y)}{n-1}$
The DSPR data has a more extreme pattern of covariance. Both the DSPR and CEGS data show a strong relationship among the Yolk proteins.
In [ ]:
# Side-by-side heatmaps of the covariance matrices (DSPR left, CEGS right).
# `.T.cov()` is the pairwise covariance between the rows of each table.
# The colour scale is fixed to [-1, 1] and the colour bar is hidden.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 10))
for panel, label, table in ((ax1, 'DSPR', dspr), (ax2, 'CEGS', cegs)):
    sns.heatmap(table.T.cov(), vmin=-1, vmax=1, square=True, ax=panel, cbar=False)
    panel.set_title(label)
    # Blank out the axis labels on both axes.
    panel.set_xlabel('')
    panel.set_ylabel('')
plt.tight_layout()

# Write the figure to disk at 300 dpi.
cov_png = os.path.join(PROJ, 'analysis_output/correlation/covariance_dspr_cegs.png')
plt.savefig(cov_png, dpi=300)
In [ ]:
# DSPR: one histogram per column of the covariance matrix, drawn on a
# 5 x 4 grid of panels that share both axes.
dspr_cov_hist_opts = dict(
    kind='hist',
    subplots=True,
    layout=(5, 4),
    sharex=True,
    sharey=True,
    figsize=(8, 8),
    rot=90,
)
p = dspr.T.cov().plot(title='DSPR Distribution of Covariances', **dspr_cov_hist_opts)
# NOTE(review): plt.tight_layout(rect=[0, 0, 1, .98]) is disabled for this
# figure although the matching CEGS figure calls it; confirm that is intended.
In [ ]:
# CEGS: one histogram per column of the covariance matrix, drawn on a
# 5 x 4 grid of panels that share both axes.
cegs_cov_hist_opts = dict(
    kind='hist',
    subplots=True,
    layout=(5, 4),
    sharex=True,
    sharey=True,
    figsize=(8, 8),
    rot=90,
)
p = cegs.T.cov().plot(title='CEGS Distribution of Covariances', **cegs_cov_hist_opts)
# Restrict the layout to the lower 98% of the figure height.
plt.tight_layout(rect=[0, 0, 1, .98])
In [ ]:
# Side-by-side heatmaps of the correlation matrices (DSPR left, CEGS right).
# `.T.corr()` is the pairwise correlation between the rows of each table.
# The colour scale is fixed to [-1, 1] and the colour bar is hidden.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 10))
for panel, label, table in ((ax1, 'DSPR', dspr), (ax2, 'CEGS', cegs)):
    sns.heatmap(table.T.corr(), vmin=-1, vmax=1, square=True, ax=panel, cbar=False)
    panel.set_title(label)
    # Blank out the axis labels on both axes.
    panel.set_xlabel('')
    panel.set_ylabel('')
plt.tight_layout()

# Write the figure to disk at 300 dpi.
corr_png = os.path.join(PROJ, 'analysis_output/correlation/correlation_dspr_cegs.png')
plt.savefig(corr_png, dpi=300)
In [ ]:
# Variance of each row of the DSPR table (computed column-wise on the
# transpose); the result is shown as the cell output.
dspr_row_var = dspr.T.var()
dspr_row_var
In [ ]:
# Variance of each row of the CEGS table (computed column-wise on the
# transpose); the result is shown as the cell output.
cegs_row_var = cegs.T.var()
cegs_row_var
In [ ]:
# DSPR: one histogram per column of the correlation matrix, drawn on a
# 5 x 4 grid of panels that share both axes.
dspr_corr_hist_opts = dict(
    kind='hist',
    subplots=True,
    layout=(5, 4),
    sharex=True,
    sharey=True,
    figsize=(8, 8),
    rot=90,
)
p = dspr.T.corr().plot(title='DSPR Distribution of Correlation', **dspr_corr_hist_opts)
# NOTE(review): plt.tight_layout(rect=[0, 0, 1, .98]) is disabled for this
# figure although the matching CEGS figure calls it; confirm that is intended.
In [ ]:
# CEGS: one histogram per column of the correlation matrix, drawn on a
# 5 x 4 grid of panels that share both axes.
cegs_corr_hist_opts = dict(
    kind='hist',
    subplots=True,
    layout=(5, 4),
    sharex=True,
    sharey=True,
    figsize=(8, 8),
    rot=90,
)
p = cegs.T.corr().plot(title='CEGS Distribution of Correlation', **cegs_corr_hist_opts)
# Restrict the layout to the lower 98% of the figure height.
plt.tight_layout(rect=[0, 0, 1, .98])
In [ ]:
# Summarise the value ranges of the pairwise (row-by-row) correlation and
# covariance matrices of both data sets. Each matrix is computed once.
dspr_corr, cegs_corr = dspr.T.corr(), cegs.T.corr()
dspr_cov, cegs_cov = dspr.T.cov(), cegs.T.cov()

# Largest and smallest correlation per data set (DSPR first, then CEGS).
# The '%s %s' form prints the same text under Python 2 and Python 3.
print('%s %s' % (dspr_corr.max().max(), dspr_corr.min().min()))
print('%s %s' % (cegs_corr.max().max(), cegs_corr.min().min()))

# Covariance limits taken across BOTH data sets, so the two heatmaps can share
# one colour scale. `covMin` was previously never defined even though the
# covariance heatmap cell passes it as `vmin`, which raised a NameError.
covMax = max(dspr_cov.max().max(), cegs_cov.max().max())
covMin = min(dspr_cov.min().min(), cegs_cov.min().min())
covMin, covMax
In [ ]:
# Side-by-side heatmaps of the covariance matrices (DSPR left, CEGS right),
# coloured on a common scale running from `covMin` to `covMax`.
# Requires `covMin` and `covMax` to be defined before this cell runs.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 10))
for panel, label, table in ((ax1, 'DSPR', dspr), (ax2, 'CEGS', cegs)):
    sns.heatmap(table.T.cov(), vmin=covMin, vmax=covMax, square=True, ax=panel, cbar=False)
    panel.set_title(label)
plt.tight_layout()

# NOTE(review): this is the same output path used by the earlier fixed-range
# covariance heatmap, so that PNG is overwritten; confirm this is intended.
cov_png = os.path.join(PROJ, 'analysis_output/correlation/covariance_dspr_cegs.png')
plt.savefig(cov_png, dpi=300)
In [ ]:
# Print the largest and smallest pairwise correlation for each data set
# (DSPR on the first line, CEGS on the second).
for corr_matrix in (dspr.T.corr(), cegs.T.corr()):
    print('%s %s' % (corr_matrix.max().max(), corr_matrix.min().min()))
In [ ]: