In [ ]:
# Import built-in modules
import atexit
import bisect
import collections
import copy
import csv
import datetime
import hashlib
import datetime
import math
import io
import os
import random
import subprocess
import sys
import tempfile
import os
import re
import pickle
import shutil
import subprocess
import sys
import operator
import inspect
import itertools
import time
import __main__
import datetime
import inspect
import os
import pickle
import glob
import shutil
import subprocess
import sys
import gzip
import contextlib
import tempfile
from pprint import pprint
%config InlineBackend.figure_format='retina'
from functools import reduce
import functools
In [ ]:
import joblib


def pmap(f, l, n_jobs=10, verbose=5):
    """Parallel map: apply f to each element of l using joblib workers."""
    parallel = joblib.Parallel(n_jobs=n_jobs, verbose=verbose)
    return parallel(joblib.delayed(f)(item) for item in l)
In [ ]:
# Import third-party scientific modules
import numpy as np
import scipy as sp
import scipy.signal
import scipy.cluster.hierarchy
import sklearn
import pandas as pd
import pandas.plotting
import statsmodels.api as sm
import matplotlib
import matplotlib.pyplot as plt
import matplotlib_venn
import mpl_toolkits.axes_grid1
from mpl_toolkits.axes_grid1 import ImageGrid
import HTSeq as hts
import hmmlearn
import hmmlearn.hmm
import pyBigWig
import pybedtools
from pybedtools import BedTool
import twobitreader
def listify(x):
    """Return x as a plain list via its tolist() method when available.

    Numpy arrays and pandas objects expose tolist(); anything else is
    returned unchanged.
    """
    # Bug fix: the original probed `self`, which is undefined in a
    # module-level function, so every call raised NameError. Probe the
    # argument instead.
    if callable(getattr(x, "tolist", None)):
        return x.tolist()
    return x
#def read_regions(fp, chroms, starts, ends, f=None):
# fh = pyBigWig.open(fp)
# for chrom, start_, end_ in zip(chroms, starts, ends):
# # fixes pyBigWig:
# # RuntimeError: You must supply a chromosome, start and end position.
# start = int(start_)
# end = int(end_)
# if f is None:
# yield fh.values(chrom, start, end)
# else:
# yield f(np.array(fh.values(chrom, start, end)))
# fh.close()
In [ ]:
# Custom module with code for aggregate plots, heatmaps, etc
%load_ext autoreload
%autoreload 2
sys.path.append(os.path.expanduser('~/relmapping/scripts/yarp'))
import yarp as yp
In [ ]:
%matplotlib inline
In [ ]:
# Color-blindness safe colors from http://www.nature.com/nmeth/journal/v8/n6/full/nmeth.1618.html
BLACK = '#000000' # 0,0,0
ORANGE = '#e69f00' # 230,159,0
SKYBLUE = '#56b4e9' # 86,180,233
GREEN = '#009e73' # 0,158,115
YELLOW = '#f0e442' # 240,228,66
BLUE = '#0072b2' # 0,114,178
RED = '#d55e00' # 213,94,0
PURPLE = '#cc79a7' # 204,121,167
# Column-name tuples for reading headerless BED6 / BED9 files with pandas.
NAMES_BED6 = ('chrom', 'start', 'end', 'name', 'score', 'strand')
NAMES_BED9 = ('chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb')
In [ ]:
def pf(id, step, suffix, prefix=''):
    """Build a pipeline file path: <prefix>/<step>/<id>.<step><suffix>."""
    # f-string instead of the fragile "'%(id)s...' % locals()" idiom,
    # which silently depends on local variable names.
    return os.path.join(prefix, step, f'{id}.{step}{suffix}')
def lp(fp):
    """Resolve fp relative to the old cluster pipeline directory."""
    base = os.path.expanduser('~/lab/HTSProcessing')
    return os.path.join(base, fp)
In [ ]:
# All relative paths in later cells assume the repository root as cwd.
os.chdir(os.path.expanduser('~/relmapping'))
print('os.getcwd():', os.getcwd())
In [ ]:
import yaml
# Load the workflow configuration. safe_load avoids arbitrary-object
# construction from the YAML stream (yaml.load without an explicit Loader
# is unsafe and deprecated).
with open('workflows/config.yaml') as fh:
    config = collections.OrderedDict(yaml.safe_load(fh).items())
In [ ]:
def deseq2x2_greater(df_counts, prefix=None):
    # Run DESeq2 (one-sided, altHypothesis="greater") on a counts table by
    # round-tripping through an external R script. Uses IPython "!" shell
    # escapes, so this is a notebook cell, not importable plain Python.
    # Returns the R script's output table; with prefix, column names are
    # namespaced as '<prefix>_<column>'.
    # NOTE(review): mkstemp returns open file descriptors (fh_inp/fh_out)
    # that are never os.close()d -- leaks two fds per call; confirm intended.
    fh_inp, fp_inp = tempfile.mkstemp(dir=os.getcwd(), prefix='deseq2x2_inp_')
    fh_out, fp_out = tempfile.mkstemp(dir=os.getcwd(), prefix='deseq2x2_out_')
    df_counts.to_csv(fp_inp, sep='\t')
    # Pipe the counts through the R script; its stdout becomes fp_out.
    !cat {fp_inp} | scripts/deseq2x2_greater.R > {fp_out}
    #!wc -l {fp_inp}
    #!wc -l {fp_out}
    #!tail -n 20 {fp_out}
    df_out = pd.read_csv(fp_out, sep='\s+')
    !rm {fp_inp}
    !rm {fp_out}
    if not(prefix is None):
        df_out.columns = [prefix + '_' + column for column in df_out.columns]
    return df_out
def deseq2x2_greaterAbs(df_counts, prefix=None):
    # Same round-trip as deseq2x2_greater but calls the two-sided
    # (altHypothesis="greaterAbs") R script. Notebook-only: relies on
    # IPython "!" shell escapes.
    # NOTE(review): the mkstemp file descriptors are never closed -- leaks
    # two fds per call; confirm intended.
    fh_inp, fp_inp = tempfile.mkstemp(dir=os.getcwd(), prefix='deseq2x2_inp_')
    fh_out, fp_out = tempfile.mkstemp(dir=os.getcwd(), prefix='deseq2x2_out_')
    df_counts.to_csv(fp_inp, sep='\t')
    # Pipe the counts through the R script; its stdout becomes fp_out.
    !cat {fp_inp} | scripts/deseq2x2_greaterAbs.R > {fp_out}
    #!wc -l {fp_inp}
    #!wc -l {fp_out}
    #!tail -n 20 {fp_out}
    df_out = pd.read_csv(fp_out, sep='\s+')
    !rm {fp_inp}
    !rm {fp_out}
    # Optionally namespace the result columns, e.g. prefix='wt'.
    if not(prefix is None):
        df_out.columns = [prefix + '_' + column for column in df_out.columns]
    return df_out
In [ ]:
def write_gff(fp, chrom, source='.', feature='.', start=float('nan'), end=float('nan'),
              score='.', strand='.', frame='.', attr=None, v=False):
    """Write a GFF-style table to fp (tab-separated, no header, unquoted).

    Parameters
    ----------
    fp : path or file-like object, passed to DataFrame.to_csv.
    chrom : iterable of chromosome names; its length fixes the row count.
    source, feature, start, end, score, strand, frame :
        scalar (broadcast to every row) or per-row iterable.
    attr : DataFrame of attribute columns packed per row as
        "key value ; key value", or None for a literal '.' field.
    v : if True, also return the assembled DataFrame (for inspection).
    """
    df = pd.DataFrame()

    def force_iter(value):
        # Broadcast scalars to the table length; strings count as scalars.
        if hasattr(value, '__iter__') and not isinstance(value, str):
            return list(value)
        return list(itertools.repeat(value, len(df)))

    df['chrom'] = list(chrom)
    df['source'] = force_iter(source)
    df['feature'] = force_iter(feature)
    df['start'] = force_iter(start)
    df['end'] = force_iter(end)
    df['score'] = force_iter(score)
    df['strand'] = force_iter(strand)
    df['frame'] = force_iter(frame)
    if attr is None:
        # Bug fix: the original called attr.iterrows() unconditionally and
        # crashed with AttributeError under its own default attr=None.
        df['attr'] = force_iter('.')
    else:
        def pack_row(ir):
            return " ; ".join("%s %s" % (k, v) for k, v in zip(ir[1].index, ir[1]))
        df['attr'] = list(map(pack_row, attr.iterrows()))
    df.to_csv(fp, sep='\t', index=False, header=False, quoting=csv.QUOTE_NONE)
    if v:
        return df
def write_gffbed(fp,
                 chrom, start, end, name='', attr=None, score=0, strand='.',
                 thickStart=None, thickEnd=None, itemRgb=yp.BLUE,
                 trackline='#track gffTags=on', v=False):
    # Write a BED9 file whose name field packs extra attributes in gffTags
    # format ("key=value;..." with spaces escaped as %20) so genome browsers
    # can display them.
    #
    # fp: output path (opened for writing). chrom/start/end: per-row
    # coordinates (chrom fixes the row count). name plus the attr DataFrame
    # become the packed name field, with Name always first. score, strand,
    # itemRgb: scalar or per-row iterable. thickStart/thickEnd default to
    # start/end when None. trackline is written as the first line. If v is
    # truthy, the assembled (unsorted) DataFrame is returned for inspection.
    df = pd.DataFrame()
    def force_iter(l_):
        # Multi-element non-string iterables are used as-is; everything else
        # (scalars, strings, non-iterables) is repeated to the table length.
        # NOTE(review): a one-element iterable is repeated as a whole object
        # for every row -- confirm that is intended.
        try:
            l = list(l_)
            if (len(l) > 1) and not(type(l_) is str):
                return l
            else:
                return list(itertools.repeat(l_, len(df)))
        except TypeError:
            return list(itertools.repeat(l_, len(df)))
        #return list(l) if hasattr(l, '__iter__') else list(itertools.repeat(l, len(df)))
    df['chrom'] = list(chrom)
    df['start'] = force_iter(start)
    df['end'] = force_iter(end)
    def pack_row(ir): return (";".join([("%s=%s" % (k, v)).replace(" ", "%20") for k, v in zip(ir[1].index, ir[1])]))
    # Prepend the Name attribute so it appears first in the packed field.
    attr_ = pd.concat([pd.DataFrame({'Name': force_iter(name)}), attr], axis=1)
    df['name'] = list(map(pack_row, attr_.iterrows()))
    df['score'] = force_iter(score)
    df['strand'] = force_iter(strand)
    if not(thickStart is None):
        df['thickStart'] = force_iter(thickStart)
    else:
        df['thickStart'] = df['start'].copy().tolist()
    if not(thickEnd is None):
        df['thickEnd'] = force_iter(thickEnd)
    else:
        df['thickEnd'] = df['end'].copy().tolist()
    df['itemRgb'] = force_iter(itemRgb)
    with open(fp, 'w') as fh:
        print(trackline, file=fh)
        # Position-sorted; header suppressed because the trackline was
        # already written.
        df.sort_values(['chrom', 'start', 'end']).to_csv(fh, sep='\t', index=False, header=False, quoting=csv.QUOTE_NONE)
    if v: return df
def read_gffbed(fp, parse_attr=(), *args, **kwargs):
    """Read a BED9 file written by write_gffbed and unpack its gffTags name field.

    parse_attr is unused by the visible code and kept only for interface
    compatibility; its mutable-list default was replaced with an immutable
    tuple to avoid the shared-mutable-default pitfall.
    Extra args/kwargs are forwarded to pandas.read_csv.
    """
    df = pd.read_csv(fp, sep='\t', names=yp.NAMES_BED9, comment='#', *args, **kwargs)
    # Attribute order within the packed name field is not preserved.
    return yp.df_gfftags_unpack(df, name='name')
In [ ]:
def df_atac_():
    """Load the eLife accessible-sites table and add an integer 'mode'
    column holding each interval's midpoint."""
    fp_ = 'annot_eLife_revised/Figure 1-source data 1. Accessible sites.txt'
    sites = pd.read_csv(fp_, sep='\t')
    midpoint = sites[['start', 'end']].mean(axis=1)
    sites['mode'] = midpoint.map(int)
    return sites
def df_mode_(flank_len=0):
    """BED3-style frame of accessible-site midpoints padded by flank_len
    on each side (end is exclusive, hence the +1)."""
    df_atac = df_atac_()
    centre = df_atac['mode']
    return pd.DataFrame({
        'chrom': df_atac['chrom'].copy(),
        'start': centre - flank_len,
        'end': centre + flank_len + 1,
    })
In [ ]:
def addon_atac_height(df_regl):
    """Attach wild-type ATAC peak-height summaries (mean and max across
    stages) to df_regl, in place."""
    fp_sites = 'annot/S1_accessible_sites/S1a_accessible_sites.tsv'
    df_sites = pd.read_csv(fp_sites, sep='\t')
    height_cols = ['atac_%s_height' % (stage,) for stage in config['stages_wt']]
    heights = df_sites[height_cols]
    df_regl['atac_wt_mean_height'] = heights.mean(axis=1)
    df_regl['atac_wt_max_height'] = heights.max(axis=1)
def addon_associated_gene_id(df_regl):
    # Fill in associated_gene_id / associated_locus_id, in place, for
    # non-promoter sites that lack a gene id, by assigning each one the id
    # of its closest coding promoter (bedtools closest, signed distance).
    # NOTE(review): regl_mode() is not defined in this file chunk -- it is
    # presumably a BED-like frame of site positions; confirm upstream.
    def promoter_gene_id_(annot_fwd, annot_rev, promoter_gene_id_fwd, promoter_gene_id_rev):
        # A site can be a promoter on either strand (or both); join both
        # ids when it drives genes in both directions, '.' when neither.
        if annot_fwd == 'coding_promoter' and annot_rev == 'coding_promoter':
            return ','.join([promoter_gene_id_fwd, promoter_gene_id_rev])
        elif annot_fwd == 'coding_promoter':
            return promoter_gene_id_fwd
        elif annot_rev == 'coding_promoter':
            return promoter_gene_id_rev
        else:
            return '.'
    df_pclosest_inp = regl_mode()
    df_pclosest_inp['gene_id'] = list(map(promoter_gene_id_, df_regl['annot_fwd'], df_regl['annot_rev'],
                                          df_regl['promoter_gene_id_fwd'], df_regl['promoter_gene_id_rev']))
    df_pclosest_inp['locus_id'] = list(map(promoter_gene_id_, df_regl['annot_fwd'], df_regl['annot_rev'],
                                           df_regl['promoter_locus_id_fwd'], df_regl['promoter_locus_id_rev']))
    # Only coding promoters can donate an id; restrict the database side of
    # the "closest" lookup to them.
    df_pclosest_inp = df_pclosest_inp[df_regl['annot'] == "coding_promoter"].reset_index(drop=True)
    def df_closest(df_a, df_b, b_prefix, **kwargs):
        # bedtools closest of df_a against position-sorted df_b; returns
        # only the prefixed df_b columns plus the signed distance column.
        df_a_sort = df_a
        df_b_sort = df_b.sort_values(list(df_b.columns[:3]))
        fn_ = BedTool.from_dataframe(df_a).closest(BedTool.from_dataframe(df_b_sort).fn, **kwargs).fn
        names_ = list(itertools.chain(df_a.columns.values,
                                      ['%s_%s' % (b_prefix, col) for col in df_b.columns.values],
                                      ['%s_distance' % (b_prefix)]
                                      ))
        df_ = pd.read_csv(fn_, names=names_, sep='\t')
        return df_[names_[len(df_a.columns):]]
    # D='ref': signed distances relative to the reference; t='first': keep
    # one hit per site on ties.
    df_pclosest_out = df_closest(regl_mode(), df_pclosest_inp, b_prefix='pclosest', D='ref', t='first')
    # Sites that did not pick up a gene id elsewhere get the closest
    # promoter's ids.
    m_ = (df_regl['annot'] != 'coding_promoter') & (df_regl['associated_gene_id'] == '.')
    print('%d non-promoters outside of outron/gene body (=no gene_id)' % (sum(m_),))
    df_regl.loc[m_, 'associated_gene_id'] = df_pclosest_out.loc[m_, 'pclosest_gene_id']
    df_regl.loc[m_, 'associated_locus_id'] = df_pclosest_out.loc[m_, 'pclosest_locus_id']
    m_ = (df_regl['annot'] != 'coding_promoter') & (df_regl['associated_gene_id'] == '.')
    assert sum(m_) == 0 # all of putative_enhancer should have a closest promoter, and a gene_id
    #return df_regl
#return df_regl
def addon_cv(df_regl):
    # Attach a coefficient-of-variation (CV) column from the Gerstein 2014
    # table to every site, in place: promoter sites get the mean CV of
    # their fwd/rev target genes, all other sites the CV of their
    # associated gene.
    #q_ = 'Gerstein2014_top15k_CV_rank == Gerstein2014_top15k_CV_rank'
    # Keep genes with a defined CV and rank < 15000; "x == x" is the NaN
    # filter (NaN != NaN).
    q_ = '(Gerstein2014_CV == Gerstein2014_CV) & (Gerstein2014_max_rank < 15000)'
    df_cv_inp = pd.read_csv('WS260_ce10/WS260_ce10.genes_by_CV.tsv', sep='\t')\
        .query(q_)[['gene_id', 'Gerstein2014_CV']]
    df_cv_out = pd.DataFrame()
    # Promoter CV: nanmean of the CVs looked up via fwd/rev promoter gene
    # ids, so a single annotated direction still yields a value.
    df_cv_out['prom_CV'] = list(map(lambda x, y: np.nanmean([x, y]),
        df_regl.merge(df_cv_inp, how='left', left_on='promoter_gene_id_fwd', right_on='gene_id')['Gerstein2014_CV'],
        df_regl.merge(df_cv_inp, how='left', left_on='promoter_gene_id_rev', right_on='gene_id')['Gerstein2014_CV'],
        ))
    df_cv_out['enh_CV'] = df_regl.merge(df_cv_inp, how='left', left_on='associated_gene_id', right_on='gene_id')['Gerstein2014_CV']
    print('%d of %d sites with CV values via promoter annotation' % (len(df_cv_out.query('prom_CV == prom_CV')), len(df_cv_out)))
    print('%d of %d sites with CV values via "associated gene"' % (len(df_cv_out.query('enh_CV == enh_CV')), len(df_cv_out)))
    # Promoters (coding or pseudogene) take the promoter CV; everything
    # else the associated-gene CV.
    m_ = (df_regl['annot'] == 'coding_promoter') | (df_regl['annot'] == 'pseudogene_promoter')
    df_regl.loc[m_, 'CV'] = df_cv_out.loc[m_, 'prom_CV']
    df_regl.loc[~m_, 'CV'] = df_cv_out.loc[~m_, 'enh_CV']
    #return df_regl
#return df_regl
def regl_addons():
    """Load the expanded eLife regulatory-element table and apply all
    in-place addon annotators to it."""
    fp_ = 'annot_eLife_full/reg_elements_eLife_full_review_expanded.tsv'
    df_regl = pd.read_csv(fp_, sep='\t')
    for annotate in (addon_atac_height, addon_associated_gene_id, addon_cv):
        annotate(df_regl)
    return df_regl
def df_regl_():
    """Load the expanded eLife regulatory-element table as-is."""
    fp_ = 'annot_eLife_full/reg_elements_eLife_full_review_expanded.tsv'
    return pd.read_csv(fp_, sep='\t')
# Intergenic associated_gene assignments look as intended, e.g.: chrI:9,789,072-9,815,313
#df_regl = regl_addons()
#fp_ = 'annot/S2_regulatory_annotation/_associated_locus_id.bed'
#df_regl[['chrom', 'start', 'end', 'associated_locus_id']].to_csv(fp_, sep='\t', index=False, header=False)
In [ ]:
def summit_pos(df):
    """Truncated-integer midpoint of each start/end interval, returned as a
    Series with a fresh RangeIndex."""
    midpoints = df[['start', 'end']].mean(axis=1)
    return midpoints.map(int).reset_index(drop=True)
def _addon_hmod(df_regl, hmod):
    """Attach per-site summary columns for one histone modification, in place.

    For each wild-type stage, reads the +/-250 bp window around each site
    summit from the stage's bigWig track, records the window mean and max,
    then collapses across stages into <hmod>_{mean,max}_{mean,max} columns.
    (Shared helper: the original addon_H3K4me3/addon_H3K27me3 bodies were
    near-identical copies; this also reads each track once instead of twice.)
    """
    df_mean = pd.DataFrame()
    df_max = pd.DataFrame()
    for stage in config['stages_wt']:
        fp_ = 'hmod_geo/tracks/%s_%s.bw' % (hmod, stage)
        print(hmod, stage, os.path.isfile(fp_))
        label = '%s\n%s' % (hmod, stage)
        summits = summit_pos(df_regl)
        windows = list(yp.read_regions(fp_, df_regl['chrom'],
                                       summits - 250, summits + 250))
        df_mean[label] = list(map(np.nanmean, windows))
        df_max[label] = list(map(np.nanmax, windows))
    df_regl['%s_mean_mean' % hmod] = df_mean.mean(axis=1)
    df_regl['%s_mean_max' % hmod] = df_mean.max(axis=1)
    df_regl['%s_max_mean' % hmod] = df_max.mean(axis=1)
    df_regl['%s_max_max' % hmod] = df_max.max(axis=1)

def addon_H3K4me3(df_regl):
    """Attach H3K4me3 summary columns to df_regl (see _addon_hmod)."""
    _addon_hmod(df_regl, 'H3K4me3')

def addon_H3K27me3(df_regl):
    """Attach H3K27me3 summary columns to df_regl (see _addon_hmod)."""
    _addon_hmod(df_regl, 'H3K27me3')
def regl_Apr27(flank_len=None):
    """Load the expanded regulatory-element table with CV columns attached.

    When flank_len is given, each interval is re-centred on its summit and
    padded to +/-flank_len (end exclusive, hence the +1).
    """
    fp_ = 'annot_eLife_full/reg_elements_eLife_full_review_expanded.tsv'
    df_regl = pd.read_csv(fp_, sep='\t')
    addon_cv(df_regl)
    if flank_len is not None:
        pos_ = summit_pos(df_regl)
        df_regl['start'] = pos_ - flank_len
        df_regl['end'] = pos_ + flank_len + 1
    return df_regl
In [ ]:
def locus_id_from_gene_id(l_gene_id):
    """Yield a human-readable display id for each WormBase gene id.

    Preference order: locus name, then sequence id, then the raw gene id.
    ("x == x" is False only for NaN, so the comparisons skip missing values.)
    """
    def best_name(locus, sequence_id, gene_id):
        if locus == locus:  # locus is not NaN
            return locus
        if sequence_id == sequence_id:  # sequence id is not NaN
            return sequence_id
        return gene_id

    fp_geneIDs = 'wget/ftp.wormbase.org/pub/wormbase/releases/WS260/species/c_elegans/PRJNA13758/annotation/c_elegans.PRJNA13758.WS260.geneIDs.txt.gz'
    keep_cols = ['gene_id', 'locus', 'sequence_id', 'status']
    df_geneIDs = pd.read_csv(fp_geneIDs, sep=',',
                             names=('na', 'gene_id', 'locus', 'sequence_id', 'status'))[keep_cols]
    df_geneIDs['display_id'] = list(map(best_name,
                                        df_geneIDs['locus'],
                                        df_geneIDs['sequence_id'],
                                        df_geneIDs['gene_id']))
    df_geneIDs = df_geneIDs.set_index('gene_id')
    for gene_id in l_gene_id:
        yield df_geneIDs.loc[gene_id]['display_id']
In [ ]:
def techreps_collapse(l_label_, include_raw=False):
    """Collapse technical-replicate labels into sample names.

    A label ending in a digit is already a sample name; a trailing letter
    (e.g. 'rep1a', 'rep1b') marks a technical replicate and is stripped.
    With include_raw, the original labels are merged back into the sorted
    result alongside the collapsed names.
    """
    labels = list(l_label_)

    def to_sample(label):
        return label if label[-1].isdigit() else label[:-1]

    collapsed = sorted({to_sample(label) for label in labels})
    if include_raw:
        return sorted(collapsed + labels)
    return collapsed
def techreps_retrieve(sample, config_):
    """Return the sorted config values whose keys start with sample."""
    matches = (value for key, value in config_.items() if key.startswith(sample))
    return sorted(matches)