In [1]:
import copy
import datetime as dt
import glob
import os
import re
import subprocess
import urllib2
import cdpybio as cpb
from ipyparallel import Client
from scipy.stats import fisher_exact
import matplotlib.gridspec as gridspec
from matplotlib.patches import Rectangle
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyencodetools as pet
import pybedtools as pbt
import seaborn as sns
import socket
import statsmodels.stats.multitest as smm
import vcf as pyvcf
import cardipspy as cpy
import ciepy
%matplotlib inline
%load_ext rpy2.ipython
dy_name = 'functional_annotation_analysis'
outdir = os.path.join(ciepy.root, 'output', dy_name)
cpy.makedir(outdir)
private_outdir = os.path.join(ciepy.root, 'private_output', dy_name)
cpy.makedir(private_outdir)
if socket.gethostname() == 'fl-hn1' or socket.gethostname() == 'fl-hn2':
    dy = os.path.join(ciepy.root, 'sandbox', 'tmp', dy_name)
    cpy.makedir(dy)
    pbt.set_tempdir(dy)
In [2]:
parallel_client = Client(profile='parallel')
dview = parallel_client[:]
print('Cluster has {} engines.'.format(len(parallel_client.ids)))
In [3]:
with dview.sync_imports():
    import cdpybio
    import os
    import pybedtools
    import scipy
    import scipy.stats
    import subprocess
In [4]:
dview.push(dict(outdir=outdir));
In [5]:
%px cpb = cdpybio
%px pbt = pybedtools
In [6]:
tg = pd.read_table(cpy.gencode_transcript_gene, index_col=0,
                   header=None, squeeze=True)
gene_info = pd.read_table(cpy.gencode_gene_info, index_col=0)
# fn = os.path.join(ciepy.root, 'output', 'eqtl_input',
# 'tpm_log_filtered_phe_std_norm_peer_resid.tsv')
# exp = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'eqtl_processing', 'eqtls01', 'qvalues.tsv')
qvalues = pd.read_table(fn, index_col=0, low_memory=False)
fn = os.path.join(ciepy.root, 'output', 'eqtl_processing', 'eqtls01', 'lead_variants.tsv')
lead_vars = pd.read_table(fn, index_col=0, low_memory=False)
fn = os.path.join(ciepy.root, 'output', 'eqtl_processing', 'eqtls01', 'lead_variants_single.tsv')
lead_vars_single = pd.read_table(fn, index_col=0, low_memory=False)
genes = pbt.BedTool(cpy.gencode_gene_bed)
fn = os.path.join(os.path.split(cpy.roadmap_15_state_annotation)[0], 'EIDlegend.txt')
roadmap_ids = pd.read_table(fn, squeeze=True, index_col=0, header=None)
In [7]:
sig = lead_vars[(lead_vars.perm_sig) & (lead_vars.variant_type != 'cnv')]
sig = sig.sort_values(by='pvalue')
sig = sig.drop_duplicates(subset='gene_id')
In [8]:
n = len(set(sig.location))
print('{:,} distinct lead SNVs.'.format(n))
In [9]:
exons = pbt.BedTool(cpy.gencode_exon_bed)
exons = exons.sort().merge()
In [10]:
# Make bed file for most sig SNVs.
s = set(sig.chrom + '\t' + sig.start.astype(int).astype(str) +
        '\t' + sig.end.astype(int).astype(str))
snvs = pbt.BedTool('\n'.join(s) + '\n', from_string=True)
snvs = snvs.sort()
# Get intergenic SNVs.
intergenic_snvs = snvs.intersect(exons, v=True)
# 5kb window centered on SNVs.
intergenic_window = intergenic_snvs.slop(l=2500, r=2500, g=pbt.genome_registry.hg19)
In [11]:
dview.push(dict(intergenic_snvs=intergenic_snvs,
                intergenic_window=intergenic_window));
In [12]:
n = len(intergenic_snvs)
print('{:,} intergenic lead SNVs.'.format(n))
In [13]:
# This code makes an input file for GoShifter.
out = os.path.join(outdir, 'goshifter_snps.tsv')
if not os.path.exists(out):
    rsids = pd.Series(lead_vars.rsid.values, index=[':'.join(x.split(':')[0:2])
                                                    for x in lead_vars.index]).dropna().drop_duplicates()
    gos_snvs = intergenic_snvs.to_dataframe()
    gos_snvs = gos_snvs[gos_snvs.end - gos_snvs.start == 1]
    gos_snvs.index = gos_snvs.chrom + ':' + gos_snvs.start.astype(str) + '-' + gos_snvs.end.astype(str)
    gos_snvs = gos_snvs.ix[gos_snvs.index & rsids.index]
    gos_snvs.index = rsids[list(gos_snvs.index)]
    gos_snvs = gos_snvs.drop('start', axis=1)
    gos_snvs.index.name = 'SNP'
    gos_snvs.columns = ['Chrom', 'BP']
    gos_snvs.to_csv(out, sep='\t')
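The file written above is the SNP map used for GoShifter: one row per intergenic lead SNV, indexed by rsID, with Chrom and BP columns. As a quick, optional sanity check (illustrative, not part of the analysis itself), the file can be read back and inspected:
# Optional check: confirm the GoShifter SNP map has the expected columns.
gos_check = pd.read_table(os.path.join(outdir, 'goshifter_snps.tsv'), index_col=0)
assert list(gos_check.columns) == ['Chrom', 'BP']
gos_check.head()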
In [14]:
def calc_goshifter_enrichment_from_url(
        url,
        variant_file,
        outdir,
        goshifter_path='/frazer01/home/cdeboever/repos/cdeboever3/goshifter',
        ld_path='/publicdata/goshifter_ld_20160727',
        num_perms=1000,
):
    """Run GoShifter for a bed file at url."""
    import shutil
    from subprocess import Popen, PIPE
    import tempfile
    from urllib2 import urlopen
    out = os.path.join(outdir, os.path.split(url)[-1])
    # Download bed file.
    temp_bed = tempfile.NamedTemporaryFile(delete=False)
    req = urlopen(url)
    with open(temp_bed.name, 'w') as d:
        shutil.copyfileobj(req, d)
    # Get cwd, then change to GoShifter directory.
    cwd = os.getcwd()
    os.chdir(goshifter_path)
    c = ('python goshifter.py '
         '--snpmap {} '
         '--annotation {} '
         '--permute {} '
         '--ld {} '
         '--out {}'.format(variant_file, temp_bed.name, num_perms, ld_path, out))
    p = Popen(c, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = p.communicate()
    with open(out + '.out', 'w') as f:
        f.write(stdout)
    with open(out + '.err', 'w') as f:
        f.write(stderr)
    os.chdir(cwd)
    os.remove(temp_bed.name)
In [15]:
dview.push(dict(calc_goshifter_enrichment_from_url=calc_goshifter_enrichment_from_url));
In [16]:
def calc_bed_enrichment_from_url(url, variants, variants_window):
"""Calculate enrichment for bed file from a URL for variants
vs. variants_window"""
bt = pbt.BedTool(cpb.general.read_gzipped_text_url(url), from_string=True)
bt = bt.sort()
bt = bt.merge()
res = variants.intersect(bt, sorted=True, wo=True)
eqtl_in_peak = len(res)
eqtl_out_peak = len(variants) - eqtl_in_peak
res = variants_window.intersect(bt, sorted=True, wo=True)
not_eqtl_in_peak = 0
for r in res:
not_eqtl_in_peak += int(r.fields[-1])
not_eqtl_in_peak -= eqtl_in_peak
total = 0
for r in variants_window:
total += r.length
not_eqtl_out_peak = total - not_eqtl_in_peak - eqtl_in_peak - eqtl_out_peak
oddsratio, p = scipy.stats.fisher_exact([[eqtl_in_peak, eqtl_out_peak],
[not_eqtl_in_peak, not_eqtl_out_peak]])
return url, oddsratio, p
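The Fisher's exact test above is computed on a 2x2 table of base-pair counts: lead SNV positions inside/outside peaks in one row, and the remaining window base pairs inside/outside peaks in the other. A toy example with made-up counts (no real data) shows the layout:
# Toy counts only, to illustrate the 2x2 table used in calc_bed_enrichment_from_url:
# row 1 = lead SNV bp inside/outside peaks, row 2 = remaining window bp inside/outside peaks.
toy_eqtl_in_peak, toy_eqtl_out_peak = 40, 160
toy_not_eqtl_in_peak, toy_not_eqtl_out_peak = 20000, 980000
toy_or, toy_p = fisher_exact([[toy_eqtl_in_peak, toy_eqtl_out_peak],
                              [toy_not_eqtl_in_peak, toy_not_eqtl_out_peak]])
print('Toy odds ratio = {:.2f}, p = {:.2e}'.format(toy_or, toy_p))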
In [17]:
dview.push(dict(calc_bed_enrichment_from_url=calc_bed_enrichment_from_url));
In [18]:
def calc_bed_enrichment(bt):
"""Calculate enrichment for a pybedtools object"""
res = intergenic_snvs.intersect(bt, sorted=True, wo=True)
eqtl_in_peak = len(res)
eqtl_out_peak = len(intergenic_snvs) - eqtl_in_peak
res = intergenic_window.intersect(bt, sorted=True, wo=True)
not_eqtl_in_peak = 0
for r in res:
not_eqtl_in_peak += int(r.fields[-1])
not_eqtl_in_peak -= eqtl_in_peak
total = 0
for r in intergenic_window:
total += r.length
not_eqtl_out_peak = total - not_eqtl_in_peak - eqtl_in_peak - eqtl_out_peak
oddsratio, p = fisher_exact([[eqtl_in_peak, eqtl_out_peak],
[not_eqtl_in_peak, not_eqtl_out_peak]])
return oddsratio, p
In [19]:
a = os.path.join(outdir, 'roadmap_ipsc_peak_pvalues.tsv')
b = os.path.join(outdir, 'roadmap_ipsc_peak_oddsratios.tsv')
if sum([os.path.exists(x) for x in [a, b]]) != 2:
    url = ('http://egg2.wustl.edu/roadmap/data/byFileType'
           '/peaks/consolidated/narrowPeak/')
    website = urllib2.urlopen(url)
    html = website.read()
    files = re.findall('href="(.*\.gz)"', html)
    lines = [x for x in roadmap_ids.index if 'iPS' in roadmap_ids[x]]
    files = [x for x in files if x.split('-')[0] in lines]
    files = [x for x in files if 'hotspot' not in x]
    roadmap_peak_pvals = pd.DataFrame(
        -1, index=lines,
        columns=set([x.split('-')[1].split('.')[0] for x in files]))
    roadmap_peak_oddsratios = pd.DataFrame(
        0, index=lines,
        columns=set([x.split('-')[1].split('.')[0] for x in files]))
    urls = ['http://egg2.wustl.edu/roadmap/data/byFileType/peaks/consolidated/narrowPeak/{}'.format(n)
            for n in files]
    res = dview.map_sync(lambda x: calc_bed_enrichment_from_url(x, intergenic_snvs, intergenic_window),
                         urls)
    for r in res:
        n = os.path.split(r[0])[1]
        roadmap_peak_pvals.ix[n.split('-')[0], n.split('-')[1].split('.')[0]] = r[2]
        roadmap_peak_oddsratios.ix[n.split('-')[0], n.split('-')[1].split('.')[0]] = r[1]
    roadmap_peak_pvals.index = roadmap_ids[roadmap_peak_pvals.index]
    roadmap_peak_oddsratios.index = roadmap_ids[roadmap_peak_oddsratios.index]
    roadmap_peak_pvals.to_csv(a, sep='\t')
    roadmap_peak_oddsratios.to_csv(b, sep='\t')
else:
    roadmap_peak_pvals = pd.read_table(a, index_col=0)
    roadmap_peak_oddsratios = pd.read_table(b, index_col=0)
In [20]:
sns.heatmap(-np.log10(roadmap_peak_pvals))
plt.ylabel('')
plt.tight_layout()
plt.savefig(os.path.join(outdir, 'roadmap_chip_seq_pval_heatmap.pdf'))
Given the enrichment in DNase peaks, I want to look at the enrichment for other cell types' DNase peaks.
In [21]:
fn = os.path.join(outdir, 'roadmap_dnase_res.tsv')
if not os.path.exists(fn):
    url = ('http://egg2.wustl.edu/roadmap/data/byFileType'
           '/peaks/consolidated/narrowPeak/')
    website = urllib2.urlopen(url)
    html = website.read()
    files = re.findall('href="(E\d\d\d-DNase.macs2.narrowPeak.gz)"', html)
    roadmap_dnase_res = pd.DataFrame(
        -1, index=[x.split('-')[0] for x in files],
        columns=['odds_ratio', 'pvalue'])
    urls = ['http://egg2.wustl.edu/roadmap/data/byFileType/peaks/consolidated/narrowPeak/{}'.format(n)
            for n in files]
    res = dview.map_sync(lambda x: calc_bed_enrichment_from_url(x, intergenic_snvs, intergenic_window),
                         urls)
    for r in res:
        n = os.path.split(r[0])[1]
        roadmap_dnase_res.ix[n.split('-')[0], 'odds_ratio'] = r[1]
        roadmap_dnase_res.ix[n.split('-')[0], 'pvalue'] = r[2]
    roadmap_dnase_res.to_csv(fn, sep='\t')
else:
    roadmap_dnase_res = pd.read_table(fn, index_col=0)
roadmap_dnase_res.index = roadmap_ids[roadmap_dnase_res.index]
In [22]:
with sns.axes_style('whitegrid'):
    t = roadmap_dnase_res.sort_values(by='pvalue', ascending=False)
    fig, ax = plt.subplots(1, 1, figsize=(5, 10))
    #(-np.log10(t.pvalue.tail(30))).plot(kind='barh')
    (-np.log10(t.pvalue)).plot(kind='barh', ax=ax)
    ax.set_xlabel('$-\log_{10}$ $p$-value')
    ax.set_ylabel('')
    ya, yb = ax.get_ylim()
    ax.vlines(-np.log10(0.05), ya, yb, color='red', linestyle='--')
    ax.vlines(-np.log10(0.01), ya, yb, color='red', linestyle='--')
    ax.vlines(-np.log10(0.001), ya, yb, color='red', linestyle='--')
    ax.set_title('DNase peak enrichment among eQTL SNVs')
    #fig.tight_layout();
In [23]:
fn = os.path.join(outdir, 'roadmap_dnase_goshifter.tsv')
if not os.path.exists(fn):
    url = ('http://egg2.wustl.edu/roadmap/data/byFileType'
           '/peaks/consolidated/narrowPeak/')
    website = urllib2.urlopen(url)
    html = website.read()
    files = re.findall('href="(E\d\d\d-DNase.macs2.narrowPeak.gz)"', html)
    roadmap_dnase_res = pd.DataFrame(
        -1, index=[x.split('-')[0] for x in files],
        columns=['odds_ratio', 'pvalue'])
    urls = ['http://egg2.wustl.edu/roadmap/data/byFileType/peaks/consolidated/narrowPeak/{}'.format(n)
            for n in files]
    cpy.makedir(os.path.join(outdir, 'roadmap_dnase_goshifter'))
    res = dview.map_sync(lambda x: calc_goshifter_enrichment_from_url(
        x, os.path.join(outdir, 'goshifter_snps.tsv'),
        os.path.join(outdir, 'roadmap_dnase_goshifter'), num_perms=1000), urls)
    # TODO: write code to combine the GoShifter results files into one dataframe and save as fn.
    # for r in res:
    #     n = os.path.split(r[0])[1]
    #     roadmap_dnase_res.ix[n.split('-')[0], 'odds_ratio'] = r[1]
    #     roadmap_dnase_res.ix[n.split('-')[0], 'pvalue'] = r[2]
    # roadmap_dnase_res.to_csv(fn, sep='\t')
# else:
#     roadmap_dnase_res = pd.read_table(fn, index_col=0)
#     roadmap_dnase_res.index = roadmap_ids[roadmap_dnase_res.index]
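As a starting point for the TODO above, here is a minimal sketch that only assumes GoShifter wrote its result files (plus the .out/.err logs captured by calc_goshifter_enrichment_from_url) into the roadmap_dnase_goshifter directory with names beginning with the Roadmap EID. It just indexes those files per EID; parsing GoShifter's own output format into odds ratios and p-values still needs to be written.
# Sketch only: collect whatever files GoShifter produced for each Roadmap EID.
# Turning these into a results dataframe (to save as fn) is left to fill in.
gos_dir = os.path.join(outdir, 'roadmap_dnase_goshifter')
files_by_eid = {}
for f in glob.glob(os.path.join(gos_dir, 'E*-DNase*')):
    eid = os.path.split(f)[1].split('-')[0]
    files_by_eid.setdefault(eid, []).append(f)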
I want to get all of the ENCODE DNase-seq data. For each experiment I need to keep track of the experiment accession, cell type, biosample type, and the accession and URL of the narrowPeak file I use.
The cell below takes a little while to run, but it only needs to run once because we save the results.
In [24]:
# Get DNase experiments.
fn = os.path.join(outdir, 'encode_dnase.tsv')
if not os.path.exists(fn):
    s = ('?type=experiment&assay_term_name=DNase-seq&assembly=hg19&'
         'files.file_type=bigBed%20narrowPeak')
    dnase_exp = pet.search(s, limit=1000)
    bad = []
    exp_acc = []
    cell_type = []
    peak_acc = []
    peak_url = []
    biosample_type = []
    for r in dnase_exp:
        r.fetch()
        keep = []
        for f in r.files:
            f.fetch()
            if f.file_type == 'bed narrowPeak':
                keep.append(f)
        if len(keep) > 0:
            cur_f = keep.pop()
            # It seems that some date_created attributes aren't formatted correctly.
            # I'll just end up taking a random bed file for these or the one that
            # has the date formatted correctly.
            try:
                year, month, day = [int(x) for x in cur_f.date_created.split('-')]
                cur_date = dt.date(year, month, day)
            except ValueError:
                cur_date = dt.date(1900, 1, 1)
                bad.append(r.accession)
            while len(keep) > 0:
                new_f = keep.pop()
                try:
                    year, month, day = [int(x) for x in new_f.date_created.split('-')]
                    new_date = dt.date(year, month, day)
                except ValueError:
                    new_date = dt.date(1900, 1, 1)
                    bad.append(r.accession)
                if new_date > cur_date:
                    cur_f = new_f
                    cur_date = new_date
            exp_acc.append(r.accession)
            cell_type.append(r.biosample_term_name)
            biosample_type.append(r.biosample_type)
            peak_acc.append(cur_f.accession)
            peak_url.append('https://www.encodeproject.org{}'.format(cur_f.href))
    encode_dnase = pd.DataFrame({'cell_type': cell_type,
                                 'narrowPeak_accession': peak_acc,
                                 'narrowPeak_url': peak_url,
                                 'biosample_type': biosample_type},
                                index=exp_acc)
    encode_dnase.to_csv(fn, sep='\t')
else:
    encode_dnase = pd.read_table(fn, index_col=0)
In [25]:
fn = os.path.join(outdir, 'encode_dnase_res.tsv')
if not os.path.exists(fn):
    encode_dnase_res = pd.DataFrame(
        -1, index=encode_dnase.index,
        columns=['odds_ratio', 'pvalue'])
    urls = list(encode_dnase.narrowPeak_url)
    res = dview.map_sync(lambda x: calc_bed_enrichment_from_url(x, intergenic_snvs, intergenic_window),
                         urls)
    for r in res:
        e = encode_dnase[encode_dnase.narrowPeak_url == r[0]].index[0]
        encode_dnase_res.ix[e, 'odds_ratio'] = r[1]
        encode_dnase_res.ix[e, 'pvalue'] = r[2]
    encode_dnase_res = encode_dnase_res.join(encode_dnase)
    encode_dnase_res.to_csv(fn, sep='\t')
else:
    encode_dnase_res = pd.read_table(fn, index_col=0)
In [ ]:
lab = []
for i in encode_dnase_res.index:
    r = pet.fetch(i)
    lab.append(r.lab.name)
encode_dnase['lab'] = lab
encode_dnase_res['lab'] = lab
In [ ]:
with sns.axes_style('whitegrid'):
    #t = encode_dnase[encode_dnase.biosample_type != 'immortalized cell line']
    t = encode_dnase[encode_dnase.lab == 'john-stamatoyannopoulos']
    t = encode_dnase_res.ix[t.index].sort_values(by='pvalue', ascending=False).tail(40)
    cdict = dict(zip(set(encode_dnase.ix[t.index, 'biosample_type']), sns.color_palette('Set1')))
    fig, ax = plt.subplots(1, 1, figsize=(5, 10))
    c = [cdict[x] for x in encode_dnase.ix[t.index, 'biosample_type']]
    ax = (-np.log10(t.pvalue)).plot(kind='barh', ax=ax, color=c, label=None)
    ax.set_ylabel('Cell Type')
    ax.set_xlabel('$-\log_{10}$ $p$-value')
    ax.set_title('ENCODE DNase enrichments')
    ya, yb = ax.get_ylim()
    ax.vlines(-np.log10(0.05), ya, yb, color='black', linestyle='--')
    ax.vlines(-np.log10(0.01), ya, yb, color='black', linestyle='--')
    ax.vlines(-np.log10(0.001), ya, yb, color='black', linestyle='--')
    ax.set_yticklabels(encode_dnase.ix[t.index, 'cell_type'])
    rects = []
    labels = []
    for k in cdict.keys():
        labels.append(k)
        r = Rectangle((0, 0), 0, 0, fc=cdict[k])
        rects.append(r)
    lgd = ax.legend(rects, labels, loc='center right',
                    frameon=True)#, prop={'size':8})
    for p in lgd.get_patches():
        p.set_linewidth(0)
In [ ]:
with sns.axes_style('whitegrid'):
    #t = encode_dnase[encode_dnase.biosample_type != 'immortalized cell line']
    t = encode_dnase[encode_dnase.lab == 'gregory-crawford']
    t = encode_dnase_res.ix[t.index].sort_values(by='pvalue', ascending=False).tail(40)
    cdict = dict(zip(set(encode_dnase.ix[t.index, 'biosample_type']), sns.color_palette('Set1')))
    fig, ax = plt.subplots(1, 1, figsize=(5, 10))
    c = [cdict[x] for x in encode_dnase.ix[t.index, 'biosample_type']]
    ax = (-np.log10(t.pvalue)).plot(kind='barh', ax=ax, color=c, label=None)
    ax.set_ylabel('Cell Type')
    ax.set_xlabel('$-\log_{10}$ $p$-value')
    ax.set_title('ENCODE DNase enrichments')
    ya, yb = ax.get_ylim()
    ax.vlines(-np.log10(0.05), ya, yb, color='black', linestyle='--')
    ax.vlines(-np.log10(0.01), ya, yb, color='black', linestyle='--')
    ax.vlines(-np.log10(0.001), ya, yb, color='black', linestyle='--')
    ax.set_yticklabels(encode_dnase.ix[t.index, 'cell_type'])
    rects = []
    labels = []
    for k in cdict.keys():
        labels.append(k)
        r = Rectangle((0, 0), 0, 0, fc=cdict[k])
        rects.append(r)
    lgd = ax.legend(rects, labels, loc='center right',
                    frameon=True)#, prop={'size':8})
    for p in lgd.get_patches():
        p.set_linewidth(0)
I'd like to programmatically obtain all of the relevant ChIP-seq datasets. I need to go into each ChIP-seq experiment and find its narrowPeak files, and I'll take the narrowPeak file that was most recently released. Often there are two replicates, but I think using one is fine for now. If I wanted to use both, I'd probably have to assume that both replicates were released on the same date, because it seems hard to figure out which replicate a given narrowPeak file comes from.
I need to record the experiment accession, cell type, ChIP target and target type, and the accession and URL of the narrowPeak file I choose for each experiment.
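The replicate handling described above boils down to keeping, per experiment, the bed narrowPeak file with the latest date_created. A compact sketch of that selection (assuming the file records have already been fetched, and using the same 1900-01-01 fallback for oddly formatted dates as the DNase cell above):
# Illustrative helper (not used by the cell below): pick the most recently created narrowPeak.
def newest_narrow_peak(files):
    def parse_date(f):
        try:
            year, month, day = [int(x) for x in f.date_created.split('-')]
            return dt.date(year, month, day)
        except ValueError:
            # Fallback for dates that aren't formatted as YYYY-MM-DD.
            return dt.date(1900, 1, 1)
    keep = [f for f in files if f.file_type == 'bed narrowPeak']
    if len(keep) == 0:
        return None
    return max(keep, key=parse_date)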
In [37]:
# Get ChIP-seq experiments.
fn = os.path.join(outdir, 'encode_stem_cell_chip_seq.tsv')
if not os.path.exists(fn):
    s = ('?type=experiment&assay_term_name=ChIP-seq&'
         'replicates.library.biosample.donor.organism.scientific_name=Homo%20sapiens&'
         'replicates.library.biosample.biosample_type=stem%20cell&files.file_type=bed%20narrowPeak')
    chip_seq_exp = pet.search(s, limit=1000)
    exp_acc = []
    cell_type = []
    target = []
    target_type = []
    peak_acc = []
    peak_url = []
    for r in chip_seq_exp:
        r.fetch()
        keep = []
        for f in r.files:
            f.fetch()
            if f.file_type == 'bed narrowPeak':
                keep.append(f)
        if len(keep) > 0:
            cur_f = keep.pop()
            year, month, day = [int(x) for x in cur_f.date_created.split('-')]
            cur_date = dt.date(year, month, day)
            while len(keep) > 0:
                new_f = keep.pop()
                year, month, day = [int(x) for x in new_f.date_created.split('-')]
                new_date = dt.date(year, month, day)
                if new_date > cur_date:
                    cur_f = new_f
                    cur_date = new_date
            exp_acc.append(r.accession)
            cell_type.append(r.biosample_term_name)
            t = r.target
            t.fetch()
            target.append(t.label)
            target_type.append(', '.join(sorted(t.investigated_as)))
            peak_acc.append(cur_f.accession)
            peak_url.append('https://www.encodeproject.org{}'.format(cur_f.href))
    encode_chip_seq = pd.DataFrame({'cell_type': cell_type,
                                    'target': target,
                                    'target_type': target_type,
                                    'narrowPeak_accession': peak_acc,
                                    'narrowPeak_url': peak_url},
                                   index=exp_acc)
    encode_chip_seq.to_csv(fn, sep='\t')
else:
    encode_chip_seq = pd.read_table(fn, index_col=0)
In [38]:
fn = os.path.join(outdir, 'encode_stem_cell_chip_seq_res.tsv')
if not os.path.exists(fn):
    encode_chip_seq_res = pd.DataFrame(
        -1, index=encode_chip_seq.index,
        columns=['odds_ratio', 'pvalue'])
    urls = list(encode_chip_seq.narrowPeak_url)
    res = dview.map_sync(lambda x: calc_bed_enrichment_from_url(x, intergenic_snvs, intergenic_window),
                         urls)
    for r in res:
        i = encode_chip_seq[encode_chip_seq.narrowPeak_url == r[0]].index[0]
        encode_chip_seq_res.ix[i, 'odds_ratio'] = r[1]
        encode_chip_seq_res.ix[i, 'pvalue'] = r[2]
    encode_chip_seq_res = encode_chip_seq_res.join(encode_chip_seq)
    num_peaks = []
    for url in encode_chip_seq_res.narrowPeak_url.values:
        s = cpb.general.read_gzipped_text_url(url)
        num_peaks.append(len(s.strip().split('\n')))
    encode_chip_seq_res['num_peaks'] = num_peaks
    encode_chip_seq_res.to_csv(fn, sep='\t')
else:
    encode_chip_seq_res = pd.read_table(fn, index_col=0)
In [39]:
encode_chip_seq_res.drop_duplicates(subset=['target']).target_type.value_counts()
Out[39]:
In [40]:
with sns.axes_style('whitegrid'):
    t = encode_chip_seq_res.sort_values(by='pvalue', ascending=False)
    fig, ax = plt.subplots(1, 1, figsize=(5, 10))
    ax = (-np.log10(t.pvalue)).plot(kind='barh', ax=ax)
    ax.set_ylabel('Target Type')
    ax.set_xlabel('$-\log_{10}$ $p$-value')
    ax.set_title('ENCODE hESC ChIP-seq enrichments')
    ya, yb = ax.get_ylim()
    ax.vlines(-np.log10(0.05), ya, yb, color='red', linestyle='--')
    ax.vlines(-np.log10(0.01), ya, yb, color='red', linestyle='--')
    ax.vlines(-np.log10(0.001), ya, yb, color='red', linestyle='--')
    ax.set_yticklabels(t.target);
In [41]:
encode_chip_seq_res.num_peaks.hist(bins=np.arange(0, 85000, 5000))
plt.xlabel('Number of peaks')
plt.ylabel('Number of experiments');
In [42]:
plt.figure(figsize=(5, 15))
se = encode_chip_seq_res.num_peaks.copy(deep=True)
se.index = encode_chip_seq_res.target
se.sort_values(inplace=True)
se.plot.barh();
In [43]:
plt.scatter(encode_chip_seq_res.num_peaks, -np.log10(encode_chip_seq_res.pvalue))
plt.xlabel('Number of peaks')
plt.ylabel('$-\\log_{10}$ $p$-value');
In [44]:
encode_chip_seq[encode_chip_seq.target == 'POU5F1']
Out[44]:
In [45]:
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
axs = axs.flatten()
ax = axs[0]
t = encode_dnase[encode_dnase.biosample_type != 'immortalized cell line']
t = encode_dnase_res.ix[t.index].sort_values(by='pvalue', ascending=False).tail(20)
cdict = dict(zip(set(encode_dnase.ix[t.index, 'biosample_type']), sns.color_palette('Set1')))
#fig, ax = plt.subplots(1, 1, figsize=(5, 10))
c = [cdict[x] for x in encode_dnase.ix[t.index, 'biosample_type']]
ax = (-np.log10(t.pvalue)).plot(kind='barh', ax=ax, color=c, label=None)
ax.set_ylabel('')
ax.set_xlabel('$-\log_{10}$ $p$-value')
#ax.set_title('ENCODE DNase enrichments')
ya, yb = ax.get_ylim()
ax.vlines(-np.log10(0.05), ya, yb, color='black', linestyle='--')
ax.vlines(-np.log10(0.01), ya, yb, color='black', linestyle='--')
ax.vlines(-np.log10(0.001), ya, yb, color='black', linestyle='--')
ax.set_yticklabels(encode_dnase.ix[t.index, 'cell_type'])
rects = []
labels = []
for k in cdict.keys():
    if k == 'induced pluripotent stem cell line':
        labels.append('iPSC')
    elif k == 'in vitro differentiated cells':
        labels.append('in vitro\ndifferentiated\ncell')
    else:
        labels.append(k)
    r = Rectangle((0, 0), 0, 0, fc=cdict[k])
    rects.append(r)
lgd = ax.legend(rects, labels, loc='lower right', frameon=True)
#bbox_to_anchor=[0.5, -0.05])#, prop={'size':8}), loc='upper center',
for p in lgd.get_patches():
    p.set_linewidth(0)
ax = axs[1]
t = encode_chip_seq_res.sort_values(by='pvalue', ascending=False).tail(30)
#fig, ax = plt.subplots(1, 1, figsize=(5, 10))
ax = (-np.log10(t.pvalue)).plot(kind='barh', ax=ax)
ax.set_ylabel('')
ax.set_xlabel('$-\log_{10}$ $p$-value')
#ax.set_title('ENCODE hESC ChIP-seq enrichments')
ya, yb = ax.get_ylim()
ax.vlines(-np.log10(0.05), ya, yb, color='black', linestyle='--')
ax.vlines(-np.log10(0.01), ya, yb, color='black', linestyle='--')
ax.vlines(-np.log10(0.001), ya, yb, color='black', linestyle='--')
ax.set_yticklabels(t.target)
fig.tight_layout()
fig.savefig(os.path.join(outdir, 'enrichments.png'),
bbox_extra_artists=(lgd,), bbox_inches='tight', dpi=600)
In [ ]: