RNA-Seq Analysis

I'd like to explore some different aspects of the expression profiles for the different samples. There are some additional samples besides those used for the eQTL study because for some people we sequenced multiple clones etc.

I'll generally be looking at the all of the data but I'll also look at a subset of European individuals from different families to remove some of the genetic and ancestry signal. While it's true that some family members are genetically unrelated, I will just take one person per family to avoid similarities from shared familial environment.


In [1]:
import copy
import cPickle
import os
import subprocess

import cdpybio as cpb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.linalg import svd
import scipy.stats as stats
import seaborn as sns
import statsmodels.formula.api as smf

import cardipspy as cpy
import ciepy

%matplotlib inline
%load_ext rpy2.ipython

outdir = os.path.join(ciepy.root, 'output',
                      'rna_seq_analysis')
cpy.makedir(outdir)

private_outdir = os.path.join(ciepy.root, 'private_output',
                              'rna_seq_analysis')
cpy.makedir(private_outdir)

In [2]:
sns.set_context('notebook')
sns.set_style('whitegrid')

In [3]:
fn = os.path.join(ciepy.root, 'output', 'input_data', 'wgs_metadata.tsv')
wgs_meta = pd.read_table(fn, index_col=0, squeeze=True)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'rnaseq_metadata.tsv')
rna_meta = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'subject_metadata.tsv')
subject_meta = pd.read_table(fn, index_col=0)

ext_meta = pd.read_table(os.path.join(ciepy.root, 'output', 'input_data', 'GSE73211.tsv'), index_col=0,
                         low_memory=False)

tpm = pd.read_table(os.path.join(ciepy.root, 'output', 'input_data', 'rsem_tpm.tsv'), index_col=0,
                                 low_memory=False)
ext_tpm = pd.read_table(os.path.join(ciepy.root, 'output', 'input_data', 'GSE73211_tpm.tsv'), index_col=0,
                        low_memory=False)

gene_info = pd.read_table(cpy.gencode_gene_info, index_col=0)

fn = os.path.join(ciepy.root, 'output', 'input_data', 'star_logs.tsv')
star_logs = pd.read_table(fn, index_col=0)

fn = os.path.join(ciepy.root, 'output', 'input_data', 'picard_metrics.tsv')
picard = pd.read_table(fn, index_col=0)

In [4]:
meta = rna_meta.merge(subject_meta, left_on='subject_id', 
                      right_index=True, how='inner')
# Get unrelated European subjects. tdf = subject_meta[subject_meta.ethnicity_group == 'European'] a = tdf[tdf.family_id.isnull()] b = tdf.dropna(subset=['family_id']) b = b.drop_duplicates(subset=['family_id']) euro_no_fam_subject = pd.concat([a, b]) print('Number of unrelated Europeans: {}'.format(euro_no_fam_subject.shape[0]))
t = rna_meta.sort(columns='in_eqtl', ascending=False).drop_duplicates(subset='subject_id') euro_no_fam = euro_no_fam_subject.merge(t, left_index=True, right_on='subject_id')
meta = rna_meta.merge(subject_meta, left_on='subject_id', right_index=True, how='outer') color_meta = pd.DataFrame(index=meta.index) color_meta['sex'] = [(1, 1, 1)] * color_meta.shape[0] ind = meta[meta.sex == 'M'].index color_meta.ix[ind, 'sex'] = [sns.color_palette()[0]] * ind.shape[0] cdict = dict(zip(set(meta.sequence_id), sns.color_palette()[1:4])) color_meta['sequence_id'] = [cdict[x] for x in meta.sequence_id] cdict = dict(zip(set(meta.estimated_ethnicity), sns.color_palette("Set2", 5))) color_meta['estimated_ethnicity'] = [cdict[x] for x in meta.estimated_ethnicity] cdict = dict(zip(set(meta.family_id), len(set(meta.family_id)) * [(1, 1, 1)])) for i, k in enumerate(meta.family_id.value_counts().head(8).index): cdict[k] = sns.color_palette('Set1', 8)[i] cdict[np.nan] = (1, 1, 1) color_meta['family_id'] = [cdict[x] for x in meta.family_id]

STAR Logs


In [5]:
fig, axs = plt.subplots(2, 2)
axs = axs.flatten()
ax = axs[0]
k = 'Number of input reads'
star_logs[k].hist(ax=ax)
ax.set_ylabel('Number of samples')
ax.set_xlabel(k)
ax = axs[1]
k = 'Uniquely mapped reads number'
star_logs[k].hist(ax=ax)
ax.set_ylabel('Number of samples')
ax.set_xlabel(k)
ax = axs[2]
k = 'Uniquely mapped reads %'
star_logs[k].hist(ax=ax)
ax.set_ylabel('Number of samples')
ax.set_xlabel(k)
ax = axs[3]
k = 'Average mapped length'
star_logs[k].hist(ax=ax)
ax.set_ylabel('Number of samples')
ax.set_xlabel(k)
plt.tight_layout()



In [6]:
n = star_logs['Number of input reads'].mean()
print('Average of {:.2f} million read pairs per sample'.format(n / 10**6))


Average of 21.59 million read pairs per sample

In [7]:
star_logs['Uniquely mapped reads %'].min()


Out[7]:
85.969999999999999

In [8]:
star_logs['Uniquely mapped reads %'].median()


Out[8]:
91.02

In [9]:
wgs_meta.cell.value_counts()


Out[9]:
Blood         255
Fibroblast     19
Name: cell, dtype: int64

Picard Metrics


In [39]:
picard[['MEDIAN_INSERT_SIZE', 'PERCENT_DUPLICATION', u'MEDIAN_5PRIME_TO_3PRIME_BIAS']].hist();



In [41]:
picard.PERCENT_DUPLICATION.median()


Out[41]:
0.158586

In [40]:
picard[['MEDIAN_INSERT_SIZE', 'PERCENT_DUPLICATION', u'MEDIAN_5PRIME_TO_3PRIME_BIAS']].describe()


Out[40]:
MEDIAN_INSERT_SIZE PERCENT_DUPLICATION MEDIAN_5PRIME_TO_3PRIME_BIAS
count 250.000000 250.000000 248.000000
mean 286.136000 0.159617 0.871093
std 21.356166 0.022173 0.055770
min 223.000000 0.103577 0.547427
25% 271.000000 0.145569 0.848672
50% 287.500000 0.158586 0.878228
75% 300.750000 0.171944 0.903255
max 352.000000 0.238257 0.999235

Expression Distribution


In [10]:
s = (tpm == 0).sum(axis=1)
print(s.value_counts().head(2))
s.hist(bins=100)
plt.title('Number of samples with zero TPM for each gene')
plt.ylabel('Number of genes')
plt.xlabel('Number of samples');


250    19020
0      15313
dtype: int64

We can see overal that there are a fair number of genes that are not expressed in any samples.


In [11]:
s = (tpm[gene_info.ix[tpm.index, 'gene_type'] == 'protein_coding'] == 0).sum(axis=1)
print(s.value_counts().head(2))
s.hist(bins=100)
plt.title('Number of samples with zero TPM for each protein coding gene')
plt.ylabel('Number of genes')
plt.xlabel('Number of samples');


0      13524
250      906
dtype: int64

Protein coding genes are highly enriched for being expressed in all samples.

Pluripotency gene expression


In [12]:
plur_markers = ['POU5F1', 'SOX2', 'NANOG', 'ZFP42', 'LIN28A']
diff_markers = ['T', 'EOMES', 'SOX17', 'FOXA2', 'GATA4', 'HAND1', 
                'CDX2', 'PAX6', 'SOX1', 'EN1']
def get_gene_id(x):
    return gene_info[gene_info.gene_name == x].index[0]
plur_markers = pd.Series(plur_markers, index=[get_gene_id(x) for x in plur_markers])
diff_markers = pd.Series(diff_markers, index=[get_gene_id(x) for x in diff_markers])

url = 'http://www.nature.com/nbt/journal/v33/n11/extref/nbt.3387-S5.xlsx'
scorecard = pd.read_excel(url)
scorecard = scorecard.drop(scorecard.columns[2:], axis=1)
scorecard = scorecard[scorecard.gene.apply(lambda x: x in gene_info.gene_name.values)]
scorecard.index = [get_gene_id(x) for x in scorecard.gene]
scorecard = scorecard[scorecard['class'].apply(lambda x: x in ['Mesoderm', 'Pluri'])]

tpm_all = pd.concat([tpm, ext_tpm], axis=1)

In [13]:
tdf = np.log10(tpm_all.ix[scorecard.index].T + 1)
tdf = tdf - tdf.mean()
tdf = tdf / tdf.std()

In [14]:
cs = pd.Series(dict(zip(list(set(ext_meta.cell_type)), sns.color_palette('colorblind')[3:6])))
rc = ([sns.color_palette('colorblind')[2]] * tpm.shape[1]) + list(cs[ext_meta.cell_type])
cs = pd.Series(dict(zip(list(set(scorecard['class'])), sns.color_palette("Set2", 7))))
cc = cs[scorecard['class']]

sns.clustermap(tdf, xticklabels=[], yticklabels=[], col_colors=cc, row_colors=rc);#, col_cluster=False);


All samples

Variable Genes

I'm interested in what genes are variable among all samples


In [15]:
# Filter for robustly expressed genes and take log.
tpm_f = tpm[(tpm == 0).sum(axis=1) == 0]
log_tpm = np.log10(tpm_f + 1)
# Mean center.
log_tpm_c = (log_tpm.T - log_tpm.mean(axis=1)).T
# Variance normalize.
log_tpm_n = (log_tpm_c.T / log_tpm_c.std(axis=1)).T

In [16]:
cov = log_tpm.std(axis=1) / log_tpm.mean(axis=1)
cov.sort_values(ascending=False, inplace=True)
cov.hist()
plt.title('Coefficient of variation histogram')
plt.xlabel('Coefficient of variation')
plt.ylabel('Number of genes');


genes = [x.split('.')[0] for x in cov.index] sig = np.array([False] * len(genes)) sig[0:1000] = True var_go_results = cpb.analysis.goseq_gene_enrichment(genes, sig)
var_go_results.head()

Clustering

#c = log_tpm_n.corr(method='spearman') cg = sns.clustermap( c, xticklabels=False, yticklabels=False, row_colors=color_meta.ix[c.index, ['estimated_ethnicity', 'sex', 'sequence_id']].T.values, col_colors=color_meta.ix[c.index, ['family_id']].T.values)
t = log_tpm_n.ix[cov.head(500).index] ct = t.corr(method='spearman') cg = sns.clustermap( ct, xticklabels=False, yticklabels=False, row_colors=color_meta.ix[ct.index, ['estimated_ethnicity', 'sex', 'sequence_id']].T.values, col_colors=color_meta.ix[ct.index, ['family_id']].T.values)

SVD


In [17]:
res = cpb.analysis.SVD(log_tpm, scale_variance=True)

In [18]:
res.plot_variance_explained(xtick_start=10, xtick_spacing=20, cumulative=False)



In [19]:
res.plot_variance_explained(num_pc=30)



In [20]:
pc_anova = res.pc_anova(meta[['sex', 'ethnicity_group', 'sequence_id']])
pc_anova.pvalue


Out[20]:
PC1 PC2 PC3 PC4 PC5
sex 0.358010 1.514248e-01 8.814301e-01 0.019714 0.884173
ethnicity_group 0.174594 9.031497e-01 2.686777e-01 0.358136 0.335773
sequence_id 0.027947 4.124048e-26 3.291938e-07 0.000669 0.989441

In [21]:
fig, ax = res.plot_pc_scatter(
    'PC1', 'PC2',
    color=meta.ix[res.v.index, 'sequence_id'],
    color_name='Flowcell')


/frazer01/home/cdeboever/software/anaconda/envs/cie/lib/python2.7/site-packages/matplotlib/axes/_axes.py:492: UserWarning: You have mixed positional and keyword arguments, some input will be discarded.
  warnings.warn("You have mixed positional and keyword "

In [22]:
fig, ax = res.plot_pc_scatter(
    'PC2', 'PC3',
    color=meta.ix[res.v.index, 'sequence_id'],
    color_name='Flowcell')



In [23]:
fig, ax = res.plot_pc_scatter(
    'PC3', 'PC4',
    color=meta.ix[res.v.index, 'sequence_id'],
    color_name='Flowcell')


The ANOVA and PC plots show that the different batches on the flowcells definitely have an effect on the expression values. The flowcell with ID 23 was done a month or so after the other three flowcells (23 had the samples that failed QC from the first three flowcells) so it's not surprising that it's more different.

Subset of Samples


In [24]:
a = meta.dropna(subset=['family_id']).drop_duplicates(subset=['family_id'])
b = meta[meta.family_id.isnull()]
euro_no_fam = pd.concat([a, b])

In [25]:
# Filter for robustly expressed genes and take log.
subset_tpm_f = tpm.ix[(tpm == 0).sum(axis=1) == 0, euro_no_fam.index]
subset_log_tpm = np.log10(subset_tpm_f + 1)
# Mean center.
subset_log_tpm_c = (subset_log_tpm.T - subset_log_tpm.mean(axis=1)).T
# Variance normalize.
subset_log_tpm_n = (subset_log_tpm_c.T / subset_log_tpm_c.std(axis=1)).T

In [26]:
subset_cov = subset_log_tpm.std(axis=1) / subset_log_tpm.mean(axis=1)
subset_cov.sort_values(ascending=False, inplace=True)
subset_cov.hist()
plt.title('Coefficient of variation histogram')
plt.xlabel('Coefficient of variation')
plt.ylabel('Number of genes');



In [27]:
subset_svd = cpb.analysis.SVD(subset_log_tpm, scale_variance=True)

In [28]:
subset_svd.plot_variance_explained(xtick_start=10, xtick_spacing=10, cumulative=False)



In [29]:
subset_svd.plot_variance_explained(num_pc=30)



In [30]:
fig, ax = subset_svd.plot_pc_scatter(
    'PC1', 'PC2',
    color=meta.ix[subset_svd.v.index, 'sequence_id'],
    marker=meta.ix[subset_svd.v.index, 'sex'],
    color_name='Flowcell',
    marker_name='Sex')



In [31]:
subset_svd_500 = cpb.analysis.SVD(subset_log_tpm_n.ix[subset_cov.head(500).index])

In [32]:
fig, ax = subset_svd_500.plot_pc_scatter(
    'PC1', 'PC2',
    color=meta.ix[subset_svd.v.index, 'sequence_id'],
    marker=meta.ix[subset_svd.v.index, 'sex'],
    color_name='Flowcell',
    marker_name='Sex')


TODO: Working here. Update stuff below.


In [33]:
t = subset_log_tpm_n.ix[subset_cov.head(500).index]
ct = t.corr(method='spearman')
cg = sns.clustermap(
    ct, xticklabels=False, yticklabels=False,
    row_colors=color_meta.ix[ct.index, ['sex']].T.values,
    col_colors=color_meta.ix[ct.index, ['sequence_id']].T.values)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-33-3f9f674290ef> in <module>()
      3 cg = sns.clustermap(
      4     ct, xticklabels=False, yticklabels=False,
----> 5     row_colors=color_meta.ix[ct.index, ['sex']].T.values,
      6     col_colors=color_meta.ix[ct.index, ['sequence_id']].T.values)

NameError: name 'color_meta' is not defined
genes = [x.split('.')[0] for x in subset_cov.index] sig = np.array([False] * len(genes)) sig[0:500] = True subset_var_go_results = cpb.analysis.goseq_gene_enrichment(genes, sig)

In [ ]:
tdf = subset_log_tpm_n.ix[subset_cov.head(500).index]
fn = os.path.join(outdir, 'subset_sparse_pca.tsv')
p = os.path.join(outdir, 'subset_sparse_pca.pickle')
if not os.path.exists(fn):
    from sklearn import decomposition
    sparse_pca = decomposition.SparsePCA(n_components=5, n_jobs=30)
    sparse_pca.fit(tdf.T)
    
    ind = ['PC{}'.format(x) for x in 
            range(1, sparse_pca.components_.shape[0] + 1)]
    sparse_components = pd.DataFrame(sparse_pca.components_, 
                                      columns=tdf.index,
                                      index=ind).T
    sparse_components.to_csv(fn, sep='\t')
    cPickle.dump(sparse_pca, open(p, 'wb'))
else:
    sparse_componenets = pd.read_table(fn, index_col=0, header=0)
    sparse_pca = cPickle.load(open(p, 'rb'))

In [ ]:
#plt.scatter(subset_svd_500.u['PC1'], sparse_pca.components_[0], lw=0, alpha=0.8)
ax = sns.jointplot(subset_svd_500.u['PC1'], sparse_pca.components_[0],
                   stat_func=None, alpha=0.25)
plt.xlabel('PCA loading')
plt.ylabel('Sparse PCA loading')
ax.set_axis_labels(xlabel='Distance in kb',
                   ylabel='$-\log_{10}$ $p$-value')
plt.tight_layout()
#plt.title('PCA vs. sparse PCA first eigenvector');

In [ ]:
sparse_pca.components_[0].max()

In [ ]:
ax = sns.jointplot(x=subset_svd_500.u['PC1'].values, y=sparse_pca.components_[0],
                   stat_func=None, alpha=0.5)
ax.set_axis_labels(xlabel='PCA eigenvector',
                   ylabel='Sparse PCA eigenvector')
plt.tight_layout()

In [ ]:
genes = [x.split('.')[0] for x in subset_cov.index]
sig = np.array([False] * len(genes))
sig[0:500] = True
subset_var_go_results = cpb.analysis.goseq_gene_enrichment(genes, sig)

In [ ]:
genes = [x.split('.')[0] for x in subset_cov.index]
sig = pd.Series(np.array([False] * len(genes)), index=subset_cov.index)
se = pd.Series(sparse_pca.components_[0], index=tdf.index)
sig[se[se != 0].index] = True
spc1_var_go_results = cpb.analysis.goseq_gene_enrichment(genes, sig.values)

In [ ]:
spc1_var_go_results.head()

In [ ]:
se = pd.Series(sparse_pca.components_[0], index=tdf.index)
t = subset_log_tpm_n.ix[se[se != 0].index]
cg = sns.clustermap(
    t, xticklabels=False, yticklabels=False,
    col_colors=color_meta.ix[ct.index, ['sex', 'sequence_id']].T.values)