I'd like to explore some different aspects of the expression profiles for the different samples. There are some additional samples besides those used for the eQTL study because for some people we sequenced multiple clones etc.
I'll generally be looking at the all of the data but I'll also look at a subset of European individuals from different families to remove some of the genetic and ancestry signal. While it's true that some family members are genetically unrelated, I will just take one person per family to avoid similarities from shared familial environment.
In [1]:
import copy
import cPickle
import os
import subprocess
import cdpybio as cpb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.linalg import svd
import scipy.stats as stats
import seaborn as sns
import statsmodels.formula.api as smf
import cardipspy as cpy
import ciepy
%matplotlib inline
%load_ext rpy2.ipython
outdir = os.path.join(ciepy.root, 'output',
'rna_seq_analysis')
cpy.makedir(outdir)
private_outdir = os.path.join(ciepy.root, 'private_output',
'rna_seq_analysis')
cpy.makedir(private_outdir)
In [2]:
sns.set_context('notebook')
sns.set_style('whitegrid')
In [3]:
fn = os.path.join(ciepy.root, 'output', 'input_data', 'wgs_metadata.tsv')
wgs_meta = pd.read_table(fn, index_col=0, squeeze=True)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'rnaseq_metadata.tsv')
rna_meta = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'subject_metadata.tsv')
subject_meta = pd.read_table(fn, index_col=0)
ext_meta = pd.read_table(os.path.join(ciepy.root, 'output', 'input_data', 'GSE73211.tsv'), index_col=0,
low_memory=False)
tpm = pd.read_table(os.path.join(ciepy.root, 'output', 'input_data', 'rsem_tpm.tsv'), index_col=0,
low_memory=False)
ext_tpm = pd.read_table(os.path.join(ciepy.root, 'output', 'input_data', 'GSE73211_tpm.tsv'), index_col=0,
low_memory=False)
gene_info = pd.read_table(cpy.gencode_gene_info, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'star_logs.tsv')
star_logs = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'picard_metrics.tsv')
picard = pd.read_table(fn, index_col=0)
In [4]:
meta = rna_meta.merge(subject_meta, left_on='subject_id',
right_index=True, how='inner')
In [5]:
fig, axs = plt.subplots(2, 2)
axs = axs.flatten()
ax = axs[0]
k = 'Number of input reads'
star_logs[k].hist(ax=ax)
ax.set_ylabel('Number of samples')
ax.set_xlabel(k)
ax = axs[1]
k = 'Uniquely mapped reads number'
star_logs[k].hist(ax=ax)
ax.set_ylabel('Number of samples')
ax.set_xlabel(k)
ax = axs[2]
k = 'Uniquely mapped reads %'
star_logs[k].hist(ax=ax)
ax.set_ylabel('Number of samples')
ax.set_xlabel(k)
ax = axs[3]
k = 'Average mapped length'
star_logs[k].hist(ax=ax)
ax.set_ylabel('Number of samples')
ax.set_xlabel(k)
plt.tight_layout()
In [6]:
n = star_logs['Number of input reads'].mean()
print('Average of {:.2f} million read pairs per sample'.format(n / 10**6))
In [7]:
star_logs['Uniquely mapped reads %'].min()
Out[7]:
In [8]:
star_logs['Uniquely mapped reads %'].median()
Out[8]:
In [9]:
wgs_meta.cell.value_counts()
Out[9]:
In [39]:
picard[['MEDIAN_INSERT_SIZE', 'PERCENT_DUPLICATION', u'MEDIAN_5PRIME_TO_3PRIME_BIAS']].hist();
In [41]:
picard.PERCENT_DUPLICATION.median()
Out[41]:
In [40]:
picard[['MEDIAN_INSERT_SIZE', 'PERCENT_DUPLICATION', u'MEDIAN_5PRIME_TO_3PRIME_BIAS']].describe()
Out[40]:
In [10]:
s = (tpm == 0).sum(axis=1)
print(s.value_counts().head(2))
s.hist(bins=100)
plt.title('Number of samples with zero TPM for each gene')
plt.ylabel('Number of genes')
plt.xlabel('Number of samples');
We can see overal that there are a fair number of genes that are not expressed in any samples.
In [11]:
s = (tpm[gene_info.ix[tpm.index, 'gene_type'] == 'protein_coding'] == 0).sum(axis=1)
print(s.value_counts().head(2))
s.hist(bins=100)
plt.title('Number of samples with zero TPM for each protein coding gene')
plt.ylabel('Number of genes')
plt.xlabel('Number of samples');
In [12]:
plur_markers = ['POU5F1', 'SOX2', 'NANOG', 'ZFP42', 'LIN28A']
diff_markers = ['T', 'EOMES', 'SOX17', 'FOXA2', 'GATA4', 'HAND1',
'CDX2', 'PAX6', 'SOX1', 'EN1']
def get_gene_id(x):
return gene_info[gene_info.gene_name == x].index[0]
plur_markers = pd.Series(plur_markers, index=[get_gene_id(x) for x in plur_markers])
diff_markers = pd.Series(diff_markers, index=[get_gene_id(x) for x in diff_markers])
url = 'http://www.nature.com/nbt/journal/v33/n11/extref/nbt.3387-S5.xlsx'
scorecard = pd.read_excel(url)
scorecard = scorecard.drop(scorecard.columns[2:], axis=1)
scorecard = scorecard[scorecard.gene.apply(lambda x: x in gene_info.gene_name.values)]
scorecard.index = [get_gene_id(x) for x in scorecard.gene]
scorecard = scorecard[scorecard['class'].apply(lambda x: x in ['Mesoderm', 'Pluri'])]
tpm_all = pd.concat([tpm, ext_tpm], axis=1)
In [13]:
tdf = np.log10(tpm_all.ix[scorecard.index].T + 1)
tdf = tdf - tdf.mean()
tdf = tdf / tdf.std()
In [14]:
cs = pd.Series(dict(zip(list(set(ext_meta.cell_type)), sns.color_palette('colorblind')[3:6])))
rc = ([sns.color_palette('colorblind')[2]] * tpm.shape[1]) + list(cs[ext_meta.cell_type])
cs = pd.Series(dict(zip(list(set(scorecard['class'])), sns.color_palette("Set2", 7))))
cc = cs[scorecard['class']]
sns.clustermap(tdf, xticklabels=[], yticklabels=[], col_colors=cc, row_colors=rc);#, col_cluster=False);
In [15]:
# Filter for robustly expressed genes and take log.
tpm_f = tpm[(tpm == 0).sum(axis=1) == 0]
log_tpm = np.log10(tpm_f + 1)
# Mean center.
log_tpm_c = (log_tpm.T - log_tpm.mean(axis=1)).T
# Variance normalize.
log_tpm_n = (log_tpm_c.T / log_tpm_c.std(axis=1)).T
In [16]:
cov = log_tpm.std(axis=1) / log_tpm.mean(axis=1)
cov.sort_values(ascending=False, inplace=True)
cov.hist()
plt.title('Coefficient of variation histogram')
plt.xlabel('Coefficient of variation')
plt.ylabel('Number of genes');
In [17]:
res = cpb.analysis.SVD(log_tpm, scale_variance=True)
In [18]:
res.plot_variance_explained(xtick_start=10, xtick_spacing=20, cumulative=False)
In [19]:
res.plot_variance_explained(num_pc=30)
In [20]:
pc_anova = res.pc_anova(meta[['sex', 'ethnicity_group', 'sequence_id']])
pc_anova.pvalue
Out[20]:
In [21]:
fig, ax = res.plot_pc_scatter(
'PC1', 'PC2',
color=meta.ix[res.v.index, 'sequence_id'],
color_name='Flowcell')
In [22]:
fig, ax = res.plot_pc_scatter(
'PC2', 'PC3',
color=meta.ix[res.v.index, 'sequence_id'],
color_name='Flowcell')
In [23]:
fig, ax = res.plot_pc_scatter(
'PC3', 'PC4',
color=meta.ix[res.v.index, 'sequence_id'],
color_name='Flowcell')
The ANOVA and PC plots show that the different batches on the flowcells definitely have an effect on the expression values. The flowcell with ID 23 was done a month or so after the other three flowcells (23 had the samples that failed QC from the first three flowcells) so it's not surprising that it's more different.
In [24]:
a = meta.dropna(subset=['family_id']).drop_duplicates(subset=['family_id'])
b = meta[meta.family_id.isnull()]
euro_no_fam = pd.concat([a, b])
In [25]:
# Filter for robustly expressed genes and take log.
subset_tpm_f = tpm.ix[(tpm == 0).sum(axis=1) == 0, euro_no_fam.index]
subset_log_tpm = np.log10(subset_tpm_f + 1)
# Mean center.
subset_log_tpm_c = (subset_log_tpm.T - subset_log_tpm.mean(axis=1)).T
# Variance normalize.
subset_log_tpm_n = (subset_log_tpm_c.T / subset_log_tpm_c.std(axis=1)).T
In [26]:
subset_cov = subset_log_tpm.std(axis=1) / subset_log_tpm.mean(axis=1)
subset_cov.sort_values(ascending=False, inplace=True)
subset_cov.hist()
plt.title('Coefficient of variation histogram')
plt.xlabel('Coefficient of variation')
plt.ylabel('Number of genes');
In [27]:
subset_svd = cpb.analysis.SVD(subset_log_tpm, scale_variance=True)
In [28]:
subset_svd.plot_variance_explained(xtick_start=10, xtick_spacing=10, cumulative=False)
In [29]:
subset_svd.plot_variance_explained(num_pc=30)
In [30]:
fig, ax = subset_svd.plot_pc_scatter(
'PC1', 'PC2',
color=meta.ix[subset_svd.v.index, 'sequence_id'],
marker=meta.ix[subset_svd.v.index, 'sex'],
color_name='Flowcell',
marker_name='Sex')
In [31]:
subset_svd_500 = cpb.analysis.SVD(subset_log_tpm_n.ix[subset_cov.head(500).index])
In [32]:
fig, ax = subset_svd_500.plot_pc_scatter(
'PC1', 'PC2',
color=meta.ix[subset_svd.v.index, 'sequence_id'],
marker=meta.ix[subset_svd.v.index, 'sex'],
color_name='Flowcell',
marker_name='Sex')
TODO: Working here. Update stuff below.
In [33]:
t = subset_log_tpm_n.ix[subset_cov.head(500).index]
ct = t.corr(method='spearman')
cg = sns.clustermap(
ct, xticklabels=False, yticklabels=False,
row_colors=color_meta.ix[ct.index, ['sex']].T.values,
col_colors=color_meta.ix[ct.index, ['sequence_id']].T.values)
In [ ]:
tdf = subset_log_tpm_n.ix[subset_cov.head(500).index]
fn = os.path.join(outdir, 'subset_sparse_pca.tsv')
p = os.path.join(outdir, 'subset_sparse_pca.pickle')
if not os.path.exists(fn):
from sklearn import decomposition
sparse_pca = decomposition.SparsePCA(n_components=5, n_jobs=30)
sparse_pca.fit(tdf.T)
ind = ['PC{}'.format(x) for x in
range(1, sparse_pca.components_.shape[0] + 1)]
sparse_components = pd.DataFrame(sparse_pca.components_,
columns=tdf.index,
index=ind).T
sparse_components.to_csv(fn, sep='\t')
cPickle.dump(sparse_pca, open(p, 'wb'))
else:
sparse_componenets = pd.read_table(fn, index_col=0, header=0)
sparse_pca = cPickle.load(open(p, 'rb'))
In [ ]:
#plt.scatter(subset_svd_500.u['PC1'], sparse_pca.components_[0], lw=0, alpha=0.8)
ax = sns.jointplot(subset_svd_500.u['PC1'], sparse_pca.components_[0],
stat_func=None, alpha=0.25)
plt.xlabel('PCA loading')
plt.ylabel('Sparse PCA loading')
ax.set_axis_labels(xlabel='Distance in kb',
ylabel='$-\log_{10}$ $p$-value')
plt.tight_layout()
#plt.title('PCA vs. sparse PCA first eigenvector');
In [ ]:
sparse_pca.components_[0].max()
In [ ]:
ax = sns.jointplot(x=subset_svd_500.u['PC1'].values, y=sparse_pca.components_[0],
stat_func=None, alpha=0.5)
ax.set_axis_labels(xlabel='PCA eigenvector',
ylabel='Sparse PCA eigenvector')
plt.tight_layout()
In [ ]:
genes = [x.split('.')[0] for x in subset_cov.index]
sig = np.array([False] * len(genes))
sig[0:500] = True
subset_var_go_results = cpb.analysis.goseq_gene_enrichment(genes, sig)
In [ ]:
genes = [x.split('.')[0] for x in subset_cov.index]
sig = pd.Series(np.array([False] * len(genes)), index=subset_cov.index)
se = pd.Series(sparse_pca.components_[0], index=tdf.index)
sig[se[se != 0].index] = True
spc1_var_go_results = cpb.analysis.goseq_gene_enrichment(genes, sig.values)
In [ ]:
spc1_var_go_results.head()
In [ ]:
se = pd.Series(sparse_pca.components_[0], index=tdf.index)
t = subset_log_tpm_n.ix[se[se != 0].index]
cg = sns.clustermap(
t, xticklabels=False, yticklabels=False,
col_colors=color_meta.ix[ct.index, ['sex', 'sequence_id']].T.values)