RNA-Seq Analysis

I'd like to explore some different aspects of the expression profiles for the different samples. There are some additional samples besides those used for the eQTL study because for some people we sequenced multiple clones etc.

I'll generally be looking at the all of the data but I'll also look at a subset of European individuals from different families to remove some of the genetic and ancestry signal. While it's true that some family members are genetically unrelated, I will just take one person per family to avoid similarities from shared familial environment.



In [1]:

    
import copy
import cPickle
import os
import subprocess

import cdpybio as cpb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.linalg import svd
import scipy.stats as stats
import seaborn as sns
import statsmodels.formula.api as smf

import cardipspy as cpy
import ciepy

%matplotlib inline
%load_ext rpy2.ipython

outdir = os.path.join(ciepy.root, 'output',
                      'rna_seq_analysis')
cpy.makedir(outdir)

private_outdir = os.path.join(ciepy.root, 'private_output',
                              'rna_seq_analysis')
cpy.makedir(private_outdir)



In [2]:

    
sns.set_context('notebook')
sns.set_style('whitegrid')



In [3]:

    
fn = os.path.join(ciepy.root, 'output', 'input_data', 'wgs_metadata.tsv')
wgs_meta = pd.read_table(fn, index_col=0, squeeze=True)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'rnaseq_metadata.tsv')
rna_meta = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'subject_metadata.tsv')
subject_meta = pd.read_table(fn, index_col=0)

ext_meta = pd.read_table(os.path.join(ciepy.root, 'output', 'input_data', 'GSE73211.tsv'), index_col=0,
                         low_memory=False)

tpm = pd.read_table(os.path.join(ciepy.root, 'output', 'input_data', 'rsem_tpm.tsv'), index_col=0,
                                 low_memory=False)
ext_tpm = pd.read_table(os.path.join(ciepy.root, 'output', 'input_data', 'GSE73211_tpm.tsv'), index_col=0,
                        low_memory=False)

gene_info = pd.read_table(cpy.gencode_gene_info, index_col=0)

fn = os.path.join(ciepy.root, 'output', 'input_data', 'star_logs.tsv')
star_logs = pd.read_table(fn, index_col=0)

fn = os.path.join(ciepy.root, 'output', 'input_data', 'picard_metrics.tsv')
picard = pd.read_table(fn, index_col=0)



In [4]:

    
meta = rna_meta.merge(subject_meta, left_on='subject_id', 
                      right_index=True, how='inner')

# Get unrelated European subjects. tdf = subject_meta[subject_meta.ethnicity_group == 'European'] a = tdf[tdf.family_id.isnull()] b = tdf.dropna(subset=['family_id']) b = b.drop_duplicates(subset=['family_id']) euro_no_fam_subject = pd.concat([a, b]) print('Number of unrelated Europeans: {}'.format(euro_no_fam_subject.shape[0]))

t = rna_meta.sort(columns='in_eqtl', ascending=False).drop_duplicates(subset='subject_id') euro_no_fam = euro_no_fam_subject.merge(t, left_index=True, right_on='subject_id')

meta = rna_meta.merge(subject_meta, left_on='subject_id', right_index=True, how='outer') color_meta = pd.DataFrame(index=meta.index) color_meta['sex'] = [(1, 1, 1)] * color_meta.shape[0] ind = meta[meta.sex == 'M'].index color_meta.ix[ind, 'sex'] = [sns.color_palette()[0]] * ind.shape[0] cdict = dict(zip(set(meta.sequence_id), sns.color_palette()[1:4])) color_meta['sequence_id'] = [cdict[x] for x in meta.sequence_id] cdict = dict(zip(set(meta.estimated_ethnicity), sns.color_palette("Set2", 5))) color_meta['estimated_ethnicity'] = [cdict[x] for x in meta.estimated_ethnicity] cdict = dict(zip(set(meta.family_id), len(set(meta.family_id)) * [(1, 1, 1)])) for i, k in enumerate(meta.family_id.value_counts().head(8).index): cdict[k] = sns.color_palette('Set1', 8)[i] cdict[np.nan] = (1, 1, 1) color_meta['family_id'] = [cdict[x] for x in meta.family_id]

STAR Logs



In [5]:

    
fig, axs = plt.subplots(2, 2)
axs = axs.flatten()
ax = axs[0]
k = 'Number of input reads'
star_logs[k].hist(ax=ax)
ax.set_ylabel('Number of samples')
ax.set_xlabel(k)
ax = axs[1]
k = 'Uniquely mapped reads number'
star_logs[k].hist(ax=ax)
ax.set_ylabel('Number of samples')
ax.set_xlabel(k)
ax = axs[2]
k = 'Uniquely mapped reads %'
star_logs[k].hist(ax=ax)
ax.set_ylabel('Number of samples')
ax.set_xlabel(k)
ax = axs[3]
k = 'Average mapped length'
star_logs[k].hist(ax=ax)
ax.set_ylabel('Number of samples')
ax.set_xlabel(k)
plt.tight_layout()



In [6]:

    
n = star_logs['Number of input reads'].mean()
print('Average of {:.2f} million read pairs per sample'.format(n / 10**6))









    



Average of 21.59 million read pairs per sample



In [7]:

    
star_logs['Uniquely mapped reads %'].min()









    Out[7]:





85.969999999999999



In [8]:

    
star_logs['Uniquely mapped reads %'].median()









    Out[8]:





91.02



In [9]:

    
wgs_meta.cell.value_counts()









    Out[9]:





Blood         255
Fibroblast     19
Name: cell, dtype: int64

Picard Metrics



In [39]:

    
picard[['MEDIAN_INSERT_SIZE', 'PERCENT_DUPLICATION', u'MEDIAN_5PRIME_TO_3PRIME_BIAS']].hist();



In [41]:

    
picard.PERCENT_DUPLICATION.median()









    Out[41]:





0.158586



In [40]:

    
picard[['MEDIAN_INSERT_SIZE', 'PERCENT_DUPLICATION', u'MEDIAN_5PRIME_TO_3PRIME_BIAS']].describe()









    Out[40]:






  
    
      
      MEDIAN_INSERT_SIZE
      PERCENT_DUPLICATION
      MEDIAN_5PRIME_TO_3PRIME_BIAS
    
  
  
    
      count
      250.000000
      250.000000
      248.000000
    
    
      mean
      286.136000
      0.159617
      0.871093
    
    
      std
      21.356166
      0.022173
      0.055770
    
    
      min
      223.000000
      0.103577
      0.547427
    
    
      25%
      271.000000
      0.145569
      0.848672
    
    
      50%
      287.500000
      0.158586
      0.878228
    
    
      75%
      300.750000
      0.171944
      0.903255
    
    
      max
      352.000000
      0.238257
      0.999235

Expression Distribution



In [10]:

    
s = (tpm == 0).sum(axis=1)
print(s.value_counts().head(2))
s.hist(bins=100)
plt.title('Number of samples with zero TPM for each gene')
plt.ylabel('Number of genes')
plt.xlabel('Number of samples');









    



250    19020
0      15313
dtype: int64

We can see overal that there are a fair number of genes that are not expressed in any samples.



In [11]:

    
s = (tpm[gene_info.ix[tpm.index, 'gene_type'] == 'protein_coding'] == 0).sum(axis=1)
print(s.value_counts().head(2))
s.hist(bins=100)
plt.title('Number of samples with zero TPM for each protein coding gene')
plt.ylabel('Number of genes')
plt.xlabel('Number of samples');









    



0      13524
250      906
dtype: int64

Protein coding genes are highly enriched for being expressed in all samples.

Pluripotency gene expression



In [12]:

    
plur_markers = ['POU5F1', 'SOX2', 'NANOG', 'ZFP42', 'LIN28A']
diff_markers = ['T', 'EOMES', 'SOX17', 'FOXA2', 'GATA4', 'HAND1', 
                'CDX2', 'PAX6', 'SOX1', 'EN1']
def get_gene_id(x):
    return gene_info[gene_info.gene_name == x].index[0]
plur_markers = pd.Series(plur_markers, index=[get_gene_id(x) for x in plur_markers])
diff_markers = pd.Series(diff_markers, index=[get_gene_id(x) for x in diff_markers])

url = 'http://www.nature.com/nbt/journal/v33/n11/extref/nbt.3387-S5.xlsx'
scorecard = pd.read_excel(url)
scorecard = scorecard.drop(scorecard.columns[2:], axis=1)
scorecard = scorecard[scorecard.gene.apply(lambda x: x in gene_info.gene_name.values)]
scorecard.index = [get_gene_id(x) for x in scorecard.gene]
scorecard = scorecard[scorecard['class'].apply(lambda x: x in ['Mesoderm', 'Pluri'])]

tpm_all = pd.concat([tpm, ext_tpm], axis=1)



In [13]:

    
tdf = np.log10(tpm_all.ix[scorecard.index].T + 1)
tdf = tdf - tdf.mean()
tdf = tdf / tdf.std()



In [14]:

    
cs = pd.Series(dict(zip(list(set(ext_meta.cell_type)), sns.color_palette('colorblind')[3:6])))
rc = ([sns.color_palette('colorblind')[2]] * tpm.shape[1]) + list(cs[ext_meta.cell_type])
cs = pd.Series(dict(zip(list(set(scorecard['class'])), sns.color_palette("Set2", 7))))
cc = cs[scorecard['class']]

sns.clustermap(tdf, xticklabels=[], yticklabels=[], col_colors=cc, row_colors=rc);#, col_cluster=False);

All samples

Variable Genes

I'm interested in what genes are variable among all samples



In [15]:

    
# Filter for robustly expressed genes and take log.
tpm_f = tpm[(tpm == 0).sum(axis=1) == 0]
log_tpm = np.log10(tpm_f + 1)
# Mean center.
log_tpm_c = (log_tpm.T - log_tpm.mean(axis=1)).T
# Variance normalize.
log_tpm_n = (log_tpm_c.T / log_tpm_c.std(axis=1)).T



In [16]:

    
cov = log_tpm.std(axis=1) / log_tpm.mean(axis=1)
cov.sort_values(ascending=False, inplace=True)
cov.hist()
plt.title('Coefficient of variation histogram')
plt.xlabel('Coefficient of variation')
plt.ylabel('Number of genes');

genes = [x.split('.')[0] for x in cov.index] sig = np.array([False] * len(genes)) sig[0:1000] = True var_go_results = cpb.analysis.goseq_gene_enrichment(genes, sig)

var_go_results.head()

Clustering

#c = log_tpm_n.corr(method='spearman') cg = sns.clustermap( c, xticklabels=False, yticklabels=False, row_colors=color_meta.ix[c.index, ['estimated_ethnicity', 'sex', 'sequence_id']].T.values, col_colors=color_meta.ix[c.index, ['family_id']].T.values)

t = log_tpm_n.ix[cov.head(500).index] ct = t.corr(method='spearman') cg = sns.clustermap( ct, xticklabels=False, yticklabels=False, row_colors=color_meta.ix[ct.index, ['estimated_ethnicity', 'sex', 'sequence_id']].T.values, col_colors=color_meta.ix[ct.index, ['family_id']].T.values)

SVD



In [17]:

    
res = cpb.analysis.SVD(log_tpm, scale_variance=True)



In [18]:

    
res.plot_variance_explained(xtick_start=10, xtick_spacing=20, cumulative=False)



In [19]:

    
res.plot_variance_explained(num_pc=30)



In [20]:

    
pc_anova = res.pc_anova(meta[['sex', 'ethnicity_group', 'sequence_id']])
pc_anova.pvalue









    Out[20]:






  
    
      
      PC1
      PC2
      PC3
      PC4
      PC5
    
  
  
    
      sex
      0.358010
      1.514248e-01
      8.814301e-01
      0.019714
      0.884173
    
    
      ethnicity_group
      0.174594
      9.031497e-01
      2.686777e-01
      0.358136
      0.335773
    
    
      sequence_id
      0.027947
      4.124048e-26
      3.291938e-07
      0.000669
      0.989441



In [21]:

    
fig, ax = res.plot_pc_scatter(
    'PC1', 'PC2',
    color=meta.ix[res.v.index, 'sequence_id'],
    color_name='Flowcell')









    



/frazer01/home/cdeboever/software/anaconda/envs/cie/lib/python2.7/site-packages/matplotlib/axes/_axes.py:492: UserWarning: You have mixed positional and keyword arguments, some input will be discarded.
  warnings.warn("You have mixed positional and keyword "



In [22]:

    
fig, ax = res.plot_pc_scatter(
    'PC2', 'PC3',
    color=meta.ix[res.v.index, 'sequence_id'],
    color_name='Flowcell')



In [23]:

    
fig, ax = res.plot_pc_scatter(
    'PC3', 'PC4',
    color=meta.ix[res.v.index, 'sequence_id'],
    color_name='Flowcell')

The ANOVA and PC plots show that the different batches on the flowcells definitely have an effect on the expression values. The flowcell with ID 23 was done a month or so after the other three flowcells (23 had the samples that failed QC from the first three flowcells) so it's not surprising that it's more different.

Subset of Samples



In [24]:

    
a = meta.dropna(subset=['family_id']).drop_duplicates(subset=['family_id'])
b = meta[meta.family_id.isnull()]
euro_no_fam = pd.concat([a, b])



In [25]:

    
# Filter for robustly expressed genes and take log.
subset_tpm_f = tpm.ix[(tpm == 0).sum(axis=1) == 0, euro_no_fam.index]
subset_log_tpm = np.log10(subset_tpm_f + 1)
# Mean center.
subset_log_tpm_c = (subset_log_tpm.T - subset_log_tpm.mean(axis=1)).T
# Variance normalize.
subset_log_tpm_n = (subset_log_tpm_c.T / subset_log_tpm_c.std(axis=1)).T



In [26]:

    
subset_cov = subset_log_tpm.std(axis=1) / subset_log_tpm.mean(axis=1)
subset_cov.sort_values(ascending=False, inplace=True)
subset_cov.hist()
plt.title('Coefficient of variation histogram')
plt.xlabel('Coefficient of variation')
plt.ylabel('Number of genes');



In [27]:

    
subset_svd = cpb.analysis.SVD(subset_log_tpm, scale_variance=True)



In [28]:

    
subset_svd.plot_variance_explained(xtick_start=10, xtick_spacing=10, cumulative=False)



In [29]:

    
subset_svd.plot_variance_explained(num_pc=30)



In [30]:

    
fig, ax = subset_svd.plot_pc_scatter(
    'PC1', 'PC2',
    color=meta.ix[subset_svd.v.index, 'sequence_id'],
    marker=meta.ix[subset_svd.v.index, 'sex'],
    color_name='Flowcell',
    marker_name='Sex')



In [31]:

    
subset_svd_500 = cpb.analysis.SVD(subset_log_tpm_n.ix[subset_cov.head(500).index])



In [32]:

    
fig, ax = subset_svd_500.plot_pc_scatter(
    'PC1', 'PC2',
    color=meta.ix[subset_svd.v.index, 'sequence_id'],
    marker=meta.ix[subset_svd.v.index, 'sex'],
    color_name='Flowcell',
    marker_name='Sex')

TODO: Working here. Update stuff below.



In [33]:

    
t = subset_log_tpm_n.ix[subset_cov.head(500).index]
ct = t.corr(method='spearman')
cg = sns.clustermap(
    ct, xticklabels=False, yticklabels=False,
    row_colors=color_meta.ix[ct.index, ['sex']].T.values,
    col_colors=color_meta.ix[ct.index, ['sequence_id']].T.values)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-33-3f9f674290ef> in <module>()
      3 cg = sns.clustermap(
      4     ct, xticklabels=False, yticklabels=False,
----> 5     row_colors=color_meta.ix[ct.index, ['sex']].T.values,
      6     col_colors=color_meta.ix[ct.index, ['sequence_id']].T.values)

NameError: name 'color_meta' is not defined

genes = [x.split('.')[0] for x in subset_cov.index] sig = np.array([False] * len(genes)) sig[0:500] = True subset_var_go_results = cpb.analysis.goseq_gene_enrichment(genes, sig)



In [ ]:

    
tdf = subset_log_tpm_n.ix[subset_cov.head(500).index]
fn = os.path.join(outdir, 'subset_sparse_pca.tsv')
p = os.path.join(outdir, 'subset_sparse_pca.pickle')
if not os.path.exists(fn):
    from sklearn import decomposition
    sparse_pca = decomposition.SparsePCA(n_components=5, n_jobs=30)
    sparse_pca.fit(tdf.T)
    
    ind = ['PC{}'.format(x) for x in 
            range(1, sparse_pca.components_.shape[0] + 1)]
    sparse_components = pd.DataFrame(sparse_pca.components_, 
                                      columns=tdf.index,
                                      index=ind).T
    sparse_components.to_csv(fn, sep='\t')
    cPickle.dump(sparse_pca, open(p, 'wb'))
else:
    sparse_componenets = pd.read_table(fn, index_col=0, header=0)
    sparse_pca = cPickle.load(open(p, 'rb'))



In [ ]:

    
#plt.scatter(subset_svd_500.u['PC1'], sparse_pca.components_[0], lw=0, alpha=0.8)
ax = sns.jointplot(subset_svd_500.u['PC1'], sparse_pca.components_[0],
                   stat_func=None, alpha=0.25)
plt.xlabel('PCA loading')
plt.ylabel('Sparse PCA loading')
ax.set_axis_labels(xlabel='Distance in kb',
                   ylabel='$-\log_{10}$ $p$-value')
plt.tight_layout()
#plt.title('PCA vs. sparse PCA first eigenvector');



In [ ]:

    
sparse_pca.components_[0].max()



In [ ]:

    
ax = sns.jointplot(x=subset_svd_500.u['PC1'].values, y=sparse_pca.components_[0],
                   stat_func=None, alpha=0.5)
ax.set_axis_labels(xlabel='PCA eigenvector',
                   ylabel='Sparse PCA eigenvector')
plt.tight_layout()



In [ ]:

    
genes = [x.split('.')[0] for x in subset_cov.index]
sig = np.array([False] * len(genes))
sig[0:500] = True
subset_var_go_results = cpb.analysis.goseq_gene_enrichment(genes, sig)



In [ ]:

    
genes = [x.split('.')[0] for x in subset_cov.index]
sig = pd.Series(np.array([False] * len(genes)), index=subset_cov.index)
se = pd.Series(sparse_pca.components_[0], index=tdf.index)
sig[se[se != 0].index] = True
spc1_var_go_results = cpb.analysis.goseq_gene_enrichment(genes, sig.values)



In [ ]:

    
spc1_var_go_results.head()



In [ ]:

    
se = pd.Series(sparse_pca.components_[0], index=tdf.index)
t = subset_log_tpm_n.ix[se[se != 0].index]
cg = sns.clustermap(
    t, xticklabels=False, yticklabels=False,
    col_colors=color_meta.ix[ct.index, ['sex', 'sequence_id']].T.values)

	MEDIAN_INSERT_SIZE	PERCENT_DUPLICATION	MEDIAN_5PRIME_TO_3PRIME_BIAS
count	250.000000	250.000000	248.000000
mean	286.136000	0.159617	0.871093
std	21.356166	0.022173	0.055770
min	223.000000	0.103577	0.547427
25%	271.000000	0.145569	0.848672
50%	287.500000	0.158586	0.878228
75%	300.750000	0.171944	0.903255
max	352.000000	0.238257	0.999235

	PC1	PC2	PC3	PC4	PC5
sex	0.358010	1.514248e-01	8.814301e-01	0.019714	0.884173
ethnicity_group	0.174594	9.031497e-01	2.686777e-01	0.358136	0.335773
sequence_id	0.027947	4.124048e-26	3.291938e-07	0.000669	0.989441