In [1]:
import cPickle
import glob
import os
import random
import subprocess
import cdpybio as cpb
from ipyparallel import Client
import matplotlib as mpl
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import pybedtools as pbt
from scipy.stats import fisher_exact
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels as sms
import tabix
import vcf as pyvcf
import weblogolib as logo
import cardipspy as cpy
import ciepy
%matplotlib inline
%load_ext rpy2.ipython
dy_name = 'figure_functional_annotation'
import socket
if socket.gethostname() == 'fl-hn1' or socket.gethostname() == 'fl-hn2':
dy = os.path.join(ciepy.root, 'sandbox', 'tmp', dy_name)
cpy.makedir(dy)
pbt.set_tempdir(dy)
outdir = os.path.join(ciepy.root, 'output', dy_name)
cpy.makedir(outdir)
private_outdir = os.path.join(ciepy.root, 'private_output', dy_name)
cpy.makedir(private_outdir)
In [2]:
sns.set_style('whitegrid')
In [3]:
fn = os.path.join(ciepy.root, 'output', 'functional_annotation_analysis', 'encode_dnase_res.tsv')
encode_dnase_res = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'functional_annotation_analysis', 'roadmap_dnase_res.tsv')
roadmap_dnase_res = pd.read_table(fn, index_col=0)
fn = os.path.join(os.path.split(cpy.roadmap_15_state_annotation)[0], 'epigenome_metadata.tsv')
roadmap_meta = pd.read_table(fn, index_col=0)
fn = os.path.join(os.path.split(cpy.roadmap_15_state_annotation)[0], 'EIDlegend.txt')
roadmap_ids = pd.read_table(fn, squeeze=True, index_col=0, header=None)
fn = os.path.join(ciepy.root, 'output', 'functional_annotation_analysis', 'encode_stem_cell_chip_seq_res.tsv')
tf_res = pd.read_table(fn, index_col=0)
In [4]:
roadmap_dnase_res = roadmap_dnase_res.join(roadmap_meta[['GROUP', 'TYPE', 'ANATOMY']])
roadmap_dnase_res['name'] = roadmap_ids[roadmap_dnase_res.index].values
In [5]:
print('Tested {} Roadmap cell types.'.format(roadmap_dnase_res.shape[0]))
In [6]:
roadmap_meta.TYPE.value_counts()
Out[6]:
I can convert some of these labels directly, but some I have to break down. In particular, there are a lot of things categorized as 'PrimaryCulture'.
In [7]:
sample_type_conv = {
'PrimaryTissue':'Primary tissue',
'PrimaryCulture':'Primary culture',
'PrimaryCell':'Primary cell',
'ESCDerived':'In vitro differentiated',
'CellLine':'Immortalized line',
}
In [8]:
roadmap_dnase_res['my_cell_type'] = [sample_type_conv[x] for x in roadmap_dnase_res.TYPE]
I want to specifically label the stem and fetal cells from Roadmap.
In [9]:
roadmap_dnase_res.ix[roadmap_dnase_res.ANATOMY == 'IPSC', 'my_cell_type'] = 'Stem cell'
roadmap_dnase_res.ix[roadmap_dnase_res.ANATOMY == 'ESC', 'my_cell_type'] = 'Stem cell'
roadmap_dnase_res.ix[(roadmap_dnase_res.name.apply(lambda x: 'fetal' in x.lower())) &
(roadmap_dnase_res.TYPE == 'PrimaryTissue'), 'my_cell_type'] = 'Primary fetal tissue'
Now I'll choose the top 25 cell types and rename them with more reasonable and consistent names. For instance, ENCODE has both H1-hESC and H9 while Roadmap says "H1 Cell Line". These should be standardized. I also need to standardize capitalization and weird stuff like "skin02" in the Roadmap names.
In [10]:
roadmap_dnase_res_f = roadmap_dnase_res.sort_values(by='pvalue', ascending=False).tail(25)
name_conv = {
'Primary hematopoietic stem cells G-CSF-mobilized Male':'Hematopoietic stem cells (G-CSF mobilized)',
'Primary monocytes fromperipheralblood':'Peripheral blood monocytes',
'Foreskin Fibroblast Primary Cells skin02':'Foreskin fibroblasts',
'Fetal Adrenal Gland':'Fetal adrenal gland',
'A549 EtOH 0.02pct Lung Carcinoma Cell Line':'A549 lung carcinoma line',
'HSMM Skeletal Muscle Myoblasts Cell Line':'HSMM skeletal muscle myoblasts line',
'Fetal Intestine Large':'Fetal large intestine',
'NHLF Lung Fibroblast Primary Cells':'NHLF lung fibroblasts',
'NH-A Astrocytes Cell Line':'NH-A astrocytes',
'HUVEC Umbilical Vein Endothelial Cells Cell Line':'HUVEC umbilical vein endothelial cells',
'Fetal Stomach':'Fetal stomach',
'Primary hematopoietic stem cells G-CSF-mobilized Female':'Hematopoietic stem cells (G-CSF mobilized)',
'Fetal Kidney':'Fetal kidney',
'Breast variant Human Mammary Epithelial Cells (vHMEC)':'vHMEC breast variant mammary epithelial cells',
'Fetal Lung':'Fetal lung',
'Fetal Intestine Small':'Fetal small intestine',
'Ovary':'Ovary',
'Fetal Heart':'Fetal heart',
'HepG2 Hepatocellular Carcinoma Cell Line':'HepG2 hepatocellular carcinoma line',
'Pancreas':'Pancreas',
'H1 Derived Neuronal Progenitor Cultured Cells':'H1 derived neuronal progenitors',
'H1 BMP4 Derived Mesendoderm Cultured Cells':'H1 BMP4 derived mesendoderm',
'iPS DF 6.9 Cell Line':'iPSC (DF 6.9)',
'iPS DF 19.11 Cell Line':'iPSC (DF 19.11)',
'H9 Cell Line':'H9 hESC',
'H1 Cell Line':'H1 hESC',
'HSMM cell derived Skeletal Muscle Myotubes Cell Line':'HSMM-derived skeletal muscle myotubes line',
'Fetal Brain Male':'Fetal brain',
'Fetal Thymus':'Fetal thymus',
'H1 BMP4 Derived Trophoblast Cultured Cells':'H1 BMP4 derived trophoblasts',
'Foreskin Keratinocyte Primary Cells skin02':'Foreskin keratinocytes',
'Small Intestine':'Small intestine'
}
roadmap_dnase_res_f['my_name'] = [name_conv[x] for x in roadmap_dnase_res_f.name]
In [11]:
roadmap_dnase_res['my_name'] = np.nan
roadmap_dnase_res.ix[roadmap_dnase_res_f.index, 'my_name'] = roadmap_dnase_res_f.my_name
In [12]:
a = encode_dnase_res.shape[0]
b = encode_dnase_res.cell_type.value_counts().shape[0]
print('Tested {} samples from {} cell types.'.format(a, b))
print('Tested {} skin fibroblast samples.'.format(sum(encode_dnase_res.cell_type == 'skin fibroblast')))
In [13]:
encode_dnase_res.sort_values(by='pvalue', inplace=True)
pos = ', '.join([str(x + 1) for x in np.where(encode_dnase_res.cell_type == 'skin fibroblast')[0]])
print('The skin fibroblasts were in positions {} in the list of '
'most enriched lines.'.format(pos))
In [14]:
encode_dnase_res.biosample_type.value_counts()
Out[14]:
In [15]:
sample_type_conv = {
'primary cell':'Primary cell',
'immortalized cell line':'Immortalized line',
'in vitro differentiated cells':'In vitro differentiated',
'tissue':'Primary tissue',
'stem cell':'Stem cell',
'induced pluripotent stem cell line':'Stem cell',
}
In [16]:
encode_dnase_res['my_cell_type'] = [sample_type_conv[x] for x in encode_dnase_res.biosample_type]
Now I'll choose the top 25 cell types and rename them with more reasonable and consistent names. This site has info on the ENCODE cell lines.
In [17]:
encode_dnase_res_f = encode_dnase_res.sort_values(by='pvalue', ascending=False).tail(25)
name_conv = {
'H7-hESC':'H7 hESC',
'H1-hESC':'H1 hESC',
'cardiac fibroblast':'Cardiac fibroblasts',
'MCF-7':'MCF-7 breast cancer line',
'cardiac mesoderm':'Cardiac mesoderm',
'skin fibroblast':'Skin fibroblasts',
'dermis blood vessel endothelial cell':'Dermis blood vessel endothelium',
'RWPE1':'RWPE1 prostate prostate epithelial line',
'GM12865':'GM12865 lymphoblastoid line',
'HCT116':'HCT116 colorectal carcinoma line',
'HTR-8/SVneo':'HTR-8/SVneo trophoblast line',
'myoblast':'Myoblasts',
'urothelium cell line':'Urothelium line',
'olfactory neurosphere cell line':'Olfactory neurosphere line',
'induced pluripotent stem cell':'iPSC',
'H9':'H9 hESC',
'NT2/D1':'NT2/D1 pluripotent embryonal carcinoma line',
'K562':'K562',
'Ishikawa':'Ishikawa',
'HEK293':'HEK293',
'SK-N-SH':'SK-N-SH',
'HeLa-S3':'HeLa-S3',
'fibroblast of the conjunctiva':'Conjunctiva fibroblast',
'BE2C':'BE2C',
'fibroblast of lung':'Lung fibroblast',
'HuH-7':'HuH-7'
}
encode_dnase_res_f['my_name'] = [name_conv[x] for x in encode_dnase_res_f.cell_type]
In [18]:
encode_dnase_res['my_name'] = np.nan
encode_dnase_res.ix[encode_dnase_res_f.index, 'my_name'] = encode_dnase_res_f.my_name
In [19]:
tf_res = tf_res.sort_values(by='pvalue').drop_duplicates(subset=['target'])
tf_res.target_type.value_counts()
Out[19]:
I'll keep all of the categories that have "transcription factor."
In [20]:
tf_res = tf_res[(tf_res.target_type != 'broad histone mark, histone, histone modification') &
(tf_res.target_type != 'histone, histone modification, narrow histone mark')]
I'll also remove Pol2.
In [21]:
tf_res = tf_res.drop(tf_res[tf_res.target.apply(lambda x: 'POLR2A' in x)].index)
In [22]:
print('We have {} TF experiments.'.format(tf_res.shape[0]))
n = np.where(tf_res.target == 'NANOG')[0][0]
print('NANOG is #{} in the list of most enriched TF.'.format(n + 1))
n = np.where(tf_res.target == 'POU5F1')[0][0]
print('POU5F1 is #{} in the list of most enriched TF.'.format(n + 1))
In [23]:
tf_res = tf_res.sort_values(by='odds_ratio', ascending=False)
In [24]:
sns.palplot(sns.color_palette('Set2', 7))
In [25]:
cdict = dict(zip(set(roadmap_dnase_res.my_cell_type) | set(encode_dnase_res.my_cell_type),
sns.color_palette('Set2', 7)))
In [36]:
fig = plt.figure(figsize=(6.85, 6.1), dpi=300)
gs = gridspec.GridSpec(1, 1)
ax = fig.add_subplot(gs[0, 0])
ax.text(0, 1, 'Figure 2',
size=16, va='top')
ciepy.clean_axis(ax)
ax.set_xticks([])
ax.set_yticks([])
gs.tight_layout(fig, rect=[0, 0.95, 0.5, 1])
# Roadmap DNase
gs = gridspec.GridSpec(1, 1)
ax = fig.add_subplot(gs[0, 0])
c = [cdict[x] for x in roadmap_dnase_res_f.my_cell_type]
(-np.log10(roadmap_dnase_res_f.pvalue)).plot(kind='barh', ax=ax, color=c, lw=0)
ax.set_xlabel('$-\log_{10}$ $p$-value', fontsize=6)
ax.set_ylabel('')
ax.grid(axis='y')
ax.set_yticklabels(roadmap_dnase_res_f.my_name, fontsize=6)
for t in ax.get_xticklabels() + ax.get_yticklabels():
t.set_fontsize(6)
gs.tight_layout(fig, rect=[0, 0.45, 0.51, 0.94])
# ENCODE DNase
gs = gridspec.GridSpec(1, 1)
ax = fig.add_subplot(gs[0, 0])
c = [cdict[x] for x in encode_dnase_res_f.my_cell_type]
ax = (-np.log10(encode_dnase_res_f.pvalue)).plot(kind='barh', ax=ax, color=c, label=None, lw=0)
ax.set_xlabel('$-\log_{10}$ $p$-value', fontsize=6)
ax.set_ylabel('')
ax.grid(axis='y')
ax.set_yticklabels(encode_dnase_res_f.my_name, fontsize=6)
for t in ax.get_xticklabels() + ax.get_yticklabels():
t.set_fontsize(6)
gs.tight_layout(fig, rect=[0.49, 0.45, 1, 0.95])
# Color legend
gs = gridspec.GridSpec(1, 1)
ax = fig.add_subplot(gs[0, 0])
ciepy.clean_axis(ax)
rects = []
labels = []
for k in cdict.keys():
if k == 'induced pluripotent stem cell line':
labels.append('iPSC')
elif k == 'in vitro differentiated cells':
labels.append('in vitro\ndifferentiated\ncell')
elif k == 'immortalized cell line':
labels.append('immortalized\ncell line')
else:
labels.append(k)
r = plt.Rectangle((0, 0), 0, 0, fc=cdict[k])
rects.append(r)
lgd = ax.legend(rects, labels, loc='upper center', fontsize=6, ncol=7)
for p in lgd.get_patches():
p.set_linewidth(0)
gs.tight_layout(fig, rect=[0, 0.45, 1, 0.5])
# ENCODE ChIP-seq
gs = gridspec.GridSpec(1, 1)
ax = fig.add_subplot(gs[0, 0])
colord = {True:sns.color_palette()[0], False:sns.color_palette()[1]}
tf_res.odds_ratio.plot(ax=ax, kind='bar', lw=0, color=[colord[x] for x in tf_res.pvalue < 0.05])
ax.set_ylabel('Odds ratio', fontsize=8)
ax.set_xlabel('')
ax.grid(axis='x')
ax.set_xticklabels(tf_res.target, fontsize=8)
for t in ax.get_yticklabels():
t.set_fontsize(8)
# Color legend
rects = []
labels = ['$p$ < 0.05', '$p$ > 0.05']
r = plt.Rectangle((0, 0), 0, 0, fc=colord[True])
rects.append(r)
r = plt.Rectangle((0, 0), 0, 0, fc=colord[False])
rects.append(r)
lgd = ax.legend(rects, labels, loc='upper right', fontsize=8, frameon=True, fancybox=True)
for p in lgd.get_patches():
p.set_linewidth(0)
gs.tight_layout(fig, rect=[0, 0, 1, 0.45])
t = fig.text(0.005, 0.92, 'A', weight='bold',
size=12)
t = fig.text(0.5, 0.92, 'B', weight='bold',
size=12)
t = fig.text(0.005, 0.42, 'C', weight='bold',
size=12)
fig.savefig(os.path.join(outdir, 'functional_annotation.png'), dpi=300)
In [27]:
roadmap_dnase_res.pvalue.max()
Out[27]:
In [28]:
roadmap_dnase_res_f.pvalue.max()
Out[28]:
In [29]:
# I'll write these files so I can make them into a table for the paper.
roadmap_dnase_res.sort_values(by='pvalue').to_csv(os.path.join(outdir, 'roadmap_res.tsv'), sep='\t')
encode_dnase_res.sort_values(by='pvalue').to_csv(os.path.join(outdir, 'encode_res.tsv'), sep='\t')
tf_res.sort_values(by='odds_ratio', ascending=False).to_csv(os.path.join(outdir, 'tf_res.tsv'), sep='\t')
In [44]:
(.05) * 6.1 / 5.8
Out[44]:
In [53]:
fig = plt.figure(figsize=(6.85, 5.8), dpi=300)
# Roadmap DNase
gs = gridspec.GridSpec(1, 1)
ax = fig.add_subplot(gs[0, 0])
c = [cdict[x] for x in roadmap_dnase_res_f.my_cell_type]
(-np.log10(roadmap_dnase_res_f.pvalue)).plot(kind='barh', ax=ax, color=c, lw=0)
ax.set_xlabel('$-\log_{10}$ $p$-value', fontsize=6)
ax.set_ylabel('')
ax.grid(axis='y')
ax.set_yticklabels(roadmap_dnase_res_f.my_name, fontsize=6)
for t in ax.get_xticklabels() + ax.get_yticklabels():
t.set_fontsize(6)
gs.tight_layout(fig, rect=[0, 0.47, 0.51, 1])
# ENCODE DNase
gs = gridspec.GridSpec(1, 1)
ax = fig.add_subplot(gs[0, 0])
c = [cdict[x] for x in encode_dnase_res_f.my_cell_type]
ax = (-np.log10(encode_dnase_res_f.pvalue)).plot(kind='barh', ax=ax, color=c, label=None, lw=0)
ax.set_xlabel('$-\log_{10}$ $p$-value', fontsize=6)
ax.set_ylabel('')
ax.grid(axis='y')
ax.set_yticklabels(encode_dnase_res_f.my_name, fontsize=6)
for t in ax.get_xticklabels() + ax.get_yticklabels():
t.set_fontsize(6)
gs.tight_layout(fig, rect=[0.49, 0.47, 1, 1])
# Color legend
gs = gridspec.GridSpec(1, 1)
ax = fig.add_subplot(gs[0, 0])
ciepy.clean_axis(ax)
rects = []
labels = []
for k in cdict.keys():
if k == 'induced pluripotent stem cell line':
labels.append('iPSC')
elif k == 'in vitro differentiated cells':
labels.append('in vitro\ndifferentiated\ncell')
elif k == 'immortalized cell line':
labels.append('immortalized\ncell line')
else:
labels.append(k)
r = plt.Rectangle((0, 0), 0, 0, fc=cdict[k])
rects.append(r)
lgd = ax.legend(rects, labels, loc='upper center', fontsize=6, ncol=7)
for p in lgd.get_patches():
p.set_linewidth(0)
gs.tight_layout(fig, rect=[0, 0.46, 1, 0.52])
# ENCODE ChIP-seq
gs = gridspec.GridSpec(1, 1)
ax = fig.add_subplot(gs[0, 0])
colord = {True:sns.color_palette()[0], False:sns.color_palette()[1]}
tf_res.odds_ratio.plot(ax=ax, kind='bar', lw=0, color=[colord[x] for x in tf_res.pvalue < 0.05])
ax.set_ylabel('Odds ratio', fontsize=8)
ax.set_xlabel('')
ax.grid(axis='x')
ax.set_xticklabels(tf_res.target, fontsize=8)
for t in ax.get_yticklabels():
t.set_fontsize(8)
# Color legend
rects = []
labels = ['$p$ < 0.05', '$p$ > 0.05']
r = plt.Rectangle((0, 0), 0, 0, fc=colord[True])
rects.append(r)
r = plt.Rectangle((0, 0), 0, 0, fc=colord[False])
rects.append(r)
lgd = ax.legend(rects, labels, loc='upper right', fontsize=8, frameon=True, fancybox=True)
for p in lgd.get_patches():
p.set_linewidth(0)
gs.tight_layout(fig, rect=[0, 0, 1, 0.46])
t = fig.text(0.005, 0.97, 'A', weight='bold',
size=12)
t = fig.text(0.5, 0.97, 'B', weight='bold',
size=12)
t = fig.text(0.005, 0.43, 'C', weight='bold',
size=12)
fig.savefig(os.path.join(outdir, 'functional_annotation.pdf'), dpi=300)
In [ ]:
In [ ]:
In [30]:
3 +
In [ ]:
fig = plt.figure(figsize=(10, 6), dpi=300)
fs = 10
# Roadmap DNase
gs = gridspec.GridSpec(1, 1)
ax = fig.add_subplot(gs[0, 0])
c = [cdict[x] for x in roadmap_dnase_res_f.my_cell_type]
(-np.log10(roadmap_dnase_res_f.pvalue)).plot(kind='barh', ax=ax, color=c, lw=0)
ax.set_xlabel('$-\log_{10}$ $p$-value', fontsize=12)
ax.set_ylabel('')
ax.grid(axis='y')
ax.set_yticklabels(roadmap_dnase_res_f.my_name, fontsize=fs)
for t in ax.get_xticklabels() + ax.get_yticklabels():
t.set_fontsize(fs)
gs.tight_layout(fig, rect=[0, 0.15, 0.51, 1])
# ENCODE DNase
gs = gridspec.GridSpec(1, 1)
ax = fig.add_subplot(gs[0, 0])
c = [cdict[x] for x in encode_dnase_res_f.my_cell_type]
ax = (-np.log10(encode_dnase_res_f.pvalue)).plot(kind='barh', ax=ax, color=c, label=None, lw=0)
ax.set_xlabel('$-\log_{10}$ $p$-value', fontsize=fs)
ax.set_ylabel('')
ax.grid(axis='y')
ax.set_yticklabels(encode_dnase_res_f.my_name, fontsize=fs)
for t in ax.get_xticklabels() + ax.get_yticklabels():
t.set_fontsize(fs)
gs.tight_layout(fig, rect=[0.49, 0.15, 1, 1])
# Color legend
gs = gridspec.GridSpec(1, 1)
ax = fig.add_subplot(gs[0, 0])
ciepy.clean_axis(ax)
rects = []
labels = []
for k in cdict.keys():
if k == 'induced pluripotent stem cell line':
labels.append('iPSC')
elif k == 'in vitro differentiated cells':
labels.append('in vitro\ndifferentiated\ncell')
elif k == 'immortalized cell line':
labels.append('immortalized\ncell line')
else:
labels.append(k)
r = plt.Rectangle((0, 0), 0, 0, fc=cdict[k])
rects.append(r)
lgd = ax.legend(rects, labels, loc='upper center', fontsize=fs, ncol=4)
for p in lgd.get_patches():
p.set_linewidth(0)
gs.tight_layout(fig, rect=[0, 0, 1, 0.15])
fig.savefig(os.path.join(outdir, 'dnase_presentation.pdf'))
In [ ]:
fig = plt.figure(figsize=(8, 5), dpi=300)
# ENCODE ChIP-seq
gs = gridspec.GridSpec(1, 1)
ax = fig.add_subplot(gs[0, 0])
colord = {True:sns.color_palette()[0], False:sns.color_palette()[1]}
tf_res.odds_ratio.plot(ax=ax, kind='bar', lw=0, color=[colord[x] for x in tf_res.pvalue < 0.05])
ax.set_ylabel('Odds ratio', fontsize=12)
ax.set_xlabel('')
ax.grid(axis='x')
ax.set_xticklabels(tf_res.target, fontsize=11)
for t in ax.get_yticklabels():
t.set_fontsize(12)
# Color legend
rects = []
labels = ['$p$ < 0.05', '$p$ > 0.05']
r = plt.Rectangle((0, 0), 0, 0, fc=colord[True])
rects.append(r)
r = plt.Rectangle((0, 0), 0, 0, fc=colord[False])
rects.append(r)
lgd = ax.legend(rects, labels, loc='upper right', fontsize=12, frameon=True, fancybox=True)
for p in lgd.get_patches():
p.set_linewidth(0)
gs.tight_layout(fig, rect=[0, 0, 1, 1])
fig.savefig(os.path.join(outdir, 'tf_presentation.pdf'))