In [1]:
import copy
import glob
import os
import subprocess
import cdpybio as cpb
import matplotlib as mpl
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import vcf as pyvcf
import ciepy
import cardipspy as cpy
%matplotlib inline
%load_ext rpy2.ipython
dy_name = 'pluripotency_gene_expression'
outdir = os.path.join(ciepy.root, 'output', dy_name)
cpy.makedir(outdir)
private_outdir = os.path.join(ciepy.root, 'private_output', dy_name)
cpy.makedir(private_outdir)
Each figure should be able to fit on a single 8.5 x 11 inch page. Please do not send figure panels as individual files. We use three standard widths for figures: 1 column, 85 mm; 1.5 column, 114 mm; and 2 column, 174 mm (the full width of the page). Although your figure size may be reduced in the print journal, please keep these widths in mind. For Previews and other three-column formats, these widths are also applicable, though the width of a single column will be 55 mm.
In [2]:
fn = os.path.join(ciepy.root, 'output', 'input_data', 'wgs_metadata.tsv')
wgs_meta = pd.read_table(fn, index_col=0, squeeze=True)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'rnaseq_metadata.tsv')
rna_meta = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'subject_metadata.tsv')
subject_meta = pd.read_table(fn, index_col=0)
ext_meta = pd.read_table(os.path.join(ciepy.root, 'output', 'input_data', 'GSE73211.tsv'), index_col=0,
low_memory=False)
tpm = pd.read_table(os.path.join(ciepy.root, 'output', 'input_data', 'rsem_tpm.tsv'), index_col=0,
low_memory=False)
ext_tpm = pd.read_table(os.path.join(ciepy.root, 'output', 'input_data', 'GSE73211_tpm.tsv'), index_col=0,
low_memory=False)
gene_info = pd.read_table(cpy.gencode_gene_info, index_col=0)
In [14]:
plur_markers = ['POU5F1', 'SOX2', 'NANOG', 'ZFP42', 'LIN28A']
diff_markers = ['T', 'EOMES', 'SOX17', 'FOXA2', 'GATA4', 'HAND1',
'CDX2', 'PAX6', 'SOX1', 'EN1']
def get_gene_id(x):
return gene_info[gene_info.gene_name == x].index[0]
plur_markers = pd.Series(plur_markers, index=[get_gene_id(x) for x in plur_markers])
diff_markers = pd.Series(diff_markers, index=[get_gene_id(x) for x in diff_markers])
url = 'http://www.nature.com/nbt/journal/v33/n11/extref/nbt.3387-S5.xlsx'
if not os.path.exists(os.path.join(private_outdir, os.path.split(url)[-1])):
! wget {url}
! mv nbt.3387-S5.xlsx {private_outdir}
scorecard = pd.read_excel(os.path.join(private_outdir, 'nbt.3387-S5.xlsx'))
scorecard = scorecard.drop(scorecard.columns[2:], axis=1)
scorecard = scorecard[scorecard.gene.apply(lambda x: x in gene_info.gene_name.values)]
scorecard.index = [get_gene_id(x) for x in scorecard.gene]
scorecard = scorecard[scorecard['class'].apply(lambda x: x in ['Mesoderm', 'Pluri'])]
tpm_all = pd.concat([tpm, ext_tpm], axis=1)
In [15]:
scorecard['class'].value_counts()
Out[15]:
In [16]:
tdf = np.log10(tpm_all.ix[scorecard.index].T + 1)
tdf = tdf - tdf.mean()
tdf = tdf / tdf.std()
In [17]:
tdf.columns = gene_info.ix[tdf.columns, 'gene_name']
tdf.columns.name = 'Gene name'
In [18]:
ext_meta.cell_type.value_counts()
Out[18]:
In [19]:
colors = dict(zip(['GSE73211 {}'.format(x) for x in list(set(ext_meta.cell_type))],
sns.color_palette('Set2', 7)[3:6]))
colors['iPSC (this study)'] = sns.color_palette('Set2', 7)[2]
colors = pd.Series(colors)
colors = pd.concat([colors,
pd.Series(dict(zip(list(set(scorecard['class'])), sns.color_palette("Set2", 7))))])
colors.index = [x.replace('Pluri', 'Pluripotency') for x in colors.index]
In [20]:
cs = pd.Series(dict(zip(list(set(ext_meta.cell_type)), sns.color_palette('Set2', 7)[3:])))
rc = ([sns.color_palette('Set2', 7)[2]] * tpm.shape[1]) + list(cs[ext_meta.cell_type])
cs = pd.Series(dict(zip(list(set(scorecard['class'])), sns.color_palette("Set2", 7))))
cc = cs[scorecard['class']]
cm = sns.clustermap(tdf, yticklabels=[], col_colors=cc, row_colors=rc);
cm.cax.set_ylabel('$\\log$ TPM $z$-score')
legend_rects = cpb.analysis.make_color_legend_rects(colors)
for i in colors.index:
cm.ax_heatmap.bar(0, 0, color=colors[i],
label=i, linewidth=0)
cm.ax_heatmap.legend(bbox_to_anchor=(1.35, 1));
cm.cax.text(0, 1.1, 'Figure S2', size=16, va='bottom')
cm.savefig(os.path.join(outdir, 'pluri_heatmap.png'), dpi=300)
In [21]:
url = 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2424064/bin/1471-2164-9-269-S21.xls'
targets = pd.read_excel(url, skiprows=2, index_col=0)
targets = targets[targets['Target Pou5f1'] == 1]
In [22]:
targets = targets[[x.upper() in gene_info.gene_name.values for x in targets.index]]
targets.index = [get_gene_id(x.upper()) for x in targets.index]
print('{} POU5F1 targets in both.'.format(targets.shape[0]))
In [23]:
fn = os.path.join(ciepy.root, 'private_output', 'eqtl_input',
'filtered_all', '0000.vcf.gz')
vcf_reader = pyvcf.Reader(open(fn), compressed=True)
res_fns = glob.glob(os.path.join(ciepy.root, 'private_output', 'run_eqtl_analysis', 'eqtls01',
'gene_results', '*', 'ENS*.tsv'))
res_fns = pd.Series(res_fns,
index=[os.path.splitext(os.path.split(x)[1])[0] for x in res_fns])
In [24]:
gene_id = gene_info[gene_info.gene_name == 'POU5F1'].index[0]
In [25]:
exp = tpm_all[rna_meta[rna_meta.in_eqtl].index]
exp.columns = rna_meta.ix[exp.columns, 'wgs_id']
In [26]:
res = ciepy.read_emmax_output(res_fns[gene_id])
res = res.sort_values('PVALUE')
t = vcf_reader.fetch(res.CHROM.values[0],
res.BEG.values[0],
res.BEG.values[0] + 1)
gts = pd.DataFrame(index=exp.columns)
r = t.next()
gts['Genotype'] = 0
hets = set(exp.columns) & set([s.sample for s in r.get_hets()])
gts.ix[hets, 'Genotype'] = 1
alts = set(exp.columns) & set([s.sample for s in r.get_hom_alts()])
gts.ix[alts, 'Genotype'] = 2
In [27]:
gts.Genotype.value_counts()
Out[27]:
In [60]:
cs = pd.Series(dict(zip([0, 1, 2], sns.color_palette('Set2', 7)[3:])))
rc = cs[gts.ix[exp.columns, 'Genotype']]
t = np.log10(exp.ix[targets.index].T + 1)
t = t - t.mean()
t = t / t.std()
t[t > 5] = 5
t[t < -5] = -5
In [62]:
cm = sns.clustermap(t, xticklabels=[], yticklabels=[], row_colors=rc.values)
cm.cax.set_ylabel('$\\log$ TPM $z$-score')
cm.ax_heatmap.set_ylabel('Samples')
cm.ax_heatmap.set_xlabel('POU5F1 target genes')
legend_rects = cpb.analysis.make_color_legend_rects(cs)
for i in cs.index:
cm.ax_heatmap.bar(0, 0, color=colors[i],
label=i, linewidth=0)
cm.ax_heatmap.legend(bbox_to_anchor=(1.13, 1), title='POU5F1\neQTL\ngenotype');
cm.savefig(os.path.join(outdir, 'pou5f1_heatmap.png'), dpi=300);
In [29]:
tsvd = cpb.analysis.SVD(t.T, mean_center=False)
In [30]:
tsvd.plot_pc_scatter('PC1', 'PC2', color=gts.Genotype);
In [31]:
res = tsvd.pc_anova(gts, num_pc=10)
res.ix[:, 'Genotype', :]
Out[31]:
In [32]:
tsvd.plot_pc_scatter('PC1', 'PC5', color=gts.Genotype);
In [33]:
texp = exp[(exp > 2).sum(axis=1) >= 10]
texp = np.log10(texp + 1)
texp = texp.T - texp.mean(axis=1)
texp = (texp / texp.std()).T
In [34]:
texpsvd = cpb.analysis.SVD(texp, mean_center=False)
In [35]:
texpsvd.plot_pc_scatter('PC1', 'PC2', color=gts.Genotype);
In [36]:
res = texpsvd.pc_anova(gts, num_pc=10)
res.ix[:, 'Genotype', :]
Out[36]:
In [37]:
texp_sc = texp.ix[scorecard.index].dropna()
In [38]:
cm = sns.clustermap(texp_sc, xticklabels=[], yticklabels=[], col_colors=cs.ix[gts.ix[texp.columns, 'Genotype']].values);
In [ ]: