In [ ]:
# Import built-in modules (deduplicated and alphabetized)
import atexit
import bisect
import collections
import contextlib
import copy
import csv
import datetime
import functools
import glob
import gzip
import hashlib
import inspect
import io
import itertools
import math
import operator
import os
import pickle
import random
import re
import shutil
import subprocess
import sys
import tempfile
import time
import __main__
from functools import reduce
from pprint import pprint
%config InlineBackend.figure_format='retina'
In [ ]:
import joblib
# Parallel map: apply f to each element of l using joblib worker processes
def pmap(f, l, n_jobs=10, verbose=5):
    return joblib.Parallel(n_jobs=n_jobs, verbose=verbose)(joblib.delayed(f)(l_i) for l_i in l)
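In [ ]:
# Illustrative sanity check for pmap() (not part of the original analysis):
# apply a toy function in parallel and compare against a serial list comprehension.
def _square(x):
    return x * x
assert pmap(_square, range(10), n_jobs=2, verbose=0) == [x * x for x in range(10)]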
In [ ]:
# Import custom modules
import numpy as np
import scipy as sp
import scipy.signal
import scipy.cluster.hierarchy
import sklearn
import pandas as pd
import pandas.plotting
import statsmodels.api as sm
import matplotlib
import matplotlib.pyplot as plt
import matplotlib_venn
import mpl_toolkits.axes_grid1
from mpl_toolkits.axes_grid1 import ImageGrid
import HTSeq as hts
import hmmlearn
import hmmlearn.hmm
import pyBigWig
import pybedtools
from pybedtools import BedTool
import twobitreader
def listify(x):
    # Convert numpy arrays / pandas objects to plain lists via .tolist(); pass everything else through
    if callable(getattr(x, "tolist", None)):
        return x.tolist()
    else:
        return x
In [ ]:
%matplotlib inline
In [ ]:
# Path helper: builds <prefix>/<step>/<id>.<step><suffix>
def pf(id, step, suffix, prefix=''): return os.path.join(prefix, step, '%(id)s.%(step)s%(suffix)s' % locals())
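In [ ]:
# Usage sketch for pf() with hypothetical id/step/suffix values:
print(pf('sample1', 'tracks', '.bw', prefix='processed'))  # processed/tracks/sample1.tracks.bw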
In [ ]:
os.chdir(os.path.expanduser('~/relmapping'))
print('os.getcwd():', os.getcwd())
In [ ]:
import yaml
with open('workflows/config.yaml') as fh:
    # yaml.load() without an explicit Loader fails in PyYAML >= 6; safe_load suffices for a plain config file
    config = collections.OrderedDict(yaml.safe_load(fh).items())
In [1]:
# Color-blindness safe colors from http://www.nature.com/nmeth/journal/v8/n6/full/nmeth.1618.html
# http://jfly.iam.u-tokyo.ac.jp/color/
BLACK = '#000000' # 0,0,0
ORANGE = '#e69f00' # 230,159,0
SKYBLUE = '#56b4e9' # 86,180,233
GREEN = '#009e73' # 0,158,115
YELLOW = '#f0e442' # 240,228,66
BLUE = '#0072b2' # 0,114,178
RED = '#d55e00' # 213,94,0
PURPLE = '#cc79a7' # 204,121,167
NAMES_BED9 = ['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb']
NAMES_GTF = ['chrom', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
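In [ ]:
# Illustrative sketch (not part of the original analysis): render the palette
# defined above as labelled swatches for a quick visual check.
_palette = [('BLACK', BLACK), ('ORANGE', ORANGE), ('SKYBLUE', SKYBLUE), ('GREEN', GREEN),
            ('YELLOW', YELLOW), ('BLUE', BLUE), ('RED', RED), ('PURPLE', PURPLE)]
fig, ax = plt.subplots(figsize=(6, 1.5))
for i, (label, color) in enumerate(_palette):
    ax.bar(i, 1, color=color)
ax.set_xticks(range(len(_palette)))
ax.set_xticklabels([label for label, color in _palette], rotation=45, ha='right')
ax.set_yticks([])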
In [ ]:
def write_gffbed(fp,
chrom, start, end, name='', attr=None, score=0, strand='.',
thickStart=None, thickEnd=None, itemRgb=BLUE,
trackline='#track gffTags=on', v=False):
df = pd.DataFrame()
    def force_iter(l_):
        # Broadcast scalars (strings count as scalars here) to len(df);
        # pass genuine multi-element sequences through unchanged
        try:
            l = list(l_)
            if len(l) > 1 and not isinstance(l_, str):
                return l
            else:
                return list(itertools.repeat(l_, len(df)))
        except TypeError:
            return list(itertools.repeat(l_, len(df)))
df['chrom'] = list(chrom)
df['start'] = force_iter(start)
df['end'] = force_iter(end)
    # Pack one row of attributes into a GFF-style "key=value;key=value" name field (spaces URL-encoded)
    def pack_row(ir): return ";".join(("%s=%s" % (k, v)).replace(" ", "%20") for k, v in zip(ir[1].index, ir[1]))
attr_ = pd.concat([pd.DataFrame({'Name': force_iter(name)}), attr], axis=1)
df['name'] = list(map(pack_row, attr_.iterrows()))
df['score'] = force_iter(score)
df['strand'] = force_iter(strand)
    if thickStart is not None:
        df['thickStart'] = force_iter(thickStart)
    else:
        df['thickStart'] = df['start'].copy().tolist()
    if thickEnd is not None:
        df['thickEnd'] = force_iter(thickEnd)
    else:
        df['thickEnd'] = df['end'].copy().tolist()
df['itemRgb'] = force_iter(itemRgb)
with open(fp, 'w') as fh:
print(trackline, file=fh)
df.sort_values(['chrom', 'start', 'end']).to_csv(fh, sep='\t', index=False, header=False, quoting=csv.QUOTE_NONE)
if v: return df
def read_gffbed(fp, parse_attr=[], *args, **kwargs):
df = pd.read_csv(fp, sep='\t', names=NAMES_BED9, comment='#', *args, **kwargs)
return df_gfftags_unpack(df, name='name') # Does not preserve attribute order...
def df_gfftags_unpack(df, name='name'):
    df_out = df.drop(columns=name)  # positional axis argument was removed in pandas 2.0
df_name = pd.DataFrame(df[name].apply(hts.parse_GFF_attribute_string).tolist())
for col in df_name.columns:
df_out[col] = pd.to_numeric(df_name[col], errors='ignore', downcast='integer')
return df_out
def read_wbgtf(fp, parse_attr=[], coords_adj=False, *args, **kwargs):
df = pd.read_csv(fp, sep='\t', names=NAMES_GTF, comment='#', *args, **kwargs)
if coords_adj: # convert coordinates from GTF-style to BED-style
df['start'] = df['start'] - 1
if parse_attr:
return df_gfftags_unpack(df, name='attribute') # Does not preserve attribute order...
else:
return df
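In [ ]:
# Round-trip sketch for write_gffbed()/read_gffbed() on two toy regions
# (hypothetical coordinates and output path, for illustration only).
write_gffbed('/tmp/demo.gffbed',
             chrom=['chrI', 'chrI'], start=[100, 500], end=[200, 600],
             name=['peak1', 'peak2'], strand='+')
read_gffbed('/tmp/demo.gffbed')  # 'Name' is unpacked back into its own column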
In [ ]:
def _deseq2x2(df_counts, script, prefix=None):
    # Shared driver: write counts to a temp file, pipe through the R script, read results back
    fd_inp, fp_inp = tempfile.mkstemp(dir=os.getcwd(), prefix='deseq2x2_inp_')
    fd_out, fp_out = tempfile.mkstemp(dir=os.getcwd(), prefix='deseq2x2_out_')
    os.close(fd_inp)  # mkstemp returns open file descriptors; close them to avoid leaks
    os.close(fd_out)
    df_counts.to_csv(fp_inp, sep='\t')
    !cat {fp_inp} | {script} > {fp_out}
    df_out = pd.read_csv(fp_out, sep=r'\s+')
    !rm {fp_inp}
    !rm {fp_out}
    if prefix is not None:
        df_out.columns = [prefix + '_' + column for column in df_out.columns]
    return df_out

def deseq2x2_greater(df_counts, prefix=None):
    return _deseq2x2(df_counts, 'scripts/deseq2x2_greater.R', prefix=prefix)

def deseq2x2_greaterAbs(df_counts, prefix=None):
    return _deseq2x2(df_counts, 'scripts/deseq2x2_greaterAbs.R', prefix=prefix)
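In [ ]:
# Usage sketch for the DESeq2 2x2 wrappers. The column layout expected from
# df_counts is defined by scripts/deseq2x2_greater.R (not shown here); the
# sample names below are hypothetical.
# df_counts = pd.DataFrame(..., columns=['wt_rep1', 'wt_rep2', 'mut_rep1', 'mut_rep2'])
# df_res = deseq2x2_greater(df_counts, prefix='condA')
# df_res.head()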
In [ ]:
def read_regions(fp, chroms, starts, ends, v=False):
    assert os.path.isfile(fp)
    n_clip = 0
    fh = pyBigWig.open(fp)
    for chrom, start_, end_ in zip(chroms, starts, ends):
        # Cast to int -- pyBigWig raises "RuntimeError: You must supply a chromosome,
        # start and end position." on numpy integer types
        start = int(start_)
        end = int(end_)
        if (0 <= start) and (end <= fh.chroms(chrom)):  # end == chromosome length is valid
            r = fh.values(chrom, start, end)
        else:
            # Region extends past the chromosome: zero-fill for now
            # (TODO: fetch the partial in-bounds signal instead)
            r = np.zeros(end - start)
            n_clip += 1
        yield np.array(r)
    fh.close()
    if v: print(n_clip, 'out-of-bounds regions zero-filled')
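In [ ]:
# Usage sketch for read_regions() with a hypothetical bigWig track and a
# regions DataFrame holding chrom/start/end columns; each yielded element is a
# numpy array of per-base signal for one region.
# signal = list(read_regions('tracks/sample.bw', df_regions['chrom'], df_regions['start'], df_regions['end'], v=True))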
In [ ]:
#def add_unique(df_a, df_b):
# assert (df_a.columns == df_b.columns).all()
# bt_b_only = BedTool.from_dataframe(df_b).subtract(b=BedTool.from_dataframe(df_a), A=True)
# df_b_only = pd.read_csv(bt_b_only.fn, sep='\t', names=df_a.columns)
# df_either = pd.concat([df_a, df_b_only], axis=0, ignore_index=True).sort_values(yp.NAMES_BED3).reset_index(drop=True)
# print('add_unique: %d peaks in merged set -- %d peaks from A plus %d of %d non-overlapping peaks from B' % (len(df_either), len(df_a), len(df_b_only), len(df_b)))
# return (df_either, df_b_only)