In [1]:
%matplotlib inline
In [2]:
import numpy as np
import pandas as pd
import pysam
In [3]:
# Sample titles with corresponding barcodes
# s9: WT
# s9+bcm: WT +BCM
# s17: triple sRNA mutant
samples = {
's9': ['ATCACG', 'ACAGTG'],
's9+bcm': ['CGATGT', 'GCCAAT'],
's17': ['TTAGGC', 'GATCAG'],
}
# Barcodes
barcodes = ['ATCACG', 'ACAGTG', 'CGATGT', 'GCCAAT', 'TTAGGC', 'GATCAG']
In [6]:
bamfile = pysam.AlignmentFile('../results/19255_ATCACG_H8UHCADXX_1_20140509B_20140509_sorted.bam', 'rb')
reference = bamfile.header['SQ'][0]['SN']
reference
Out[6]:
In [7]:
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.plotting import figure, show, output_notebook
output_notebook()
In [4]:
from bokeh.plotting import vplot
from bokeh.palettes import brewer
from bokeh.models import FixedTicker, NumeralTickFormatter, Range1d, LinearAxis
strands_nobcm = {
'plus': '../results/19255_plus.bam',
'minus': '../results/19255_minus.bam'
}
strands_bcm = {
'plus': '../results/19256_plus.bam',
'minus': '../results/19256_minus.bam',
}
def strand_data(bamfiles, reference, region):
'''
Returns a dataframe with coverage data by strand for the given bamfiles,
reference and region
`bamfiles`: dictionary of the form {'plus': <filename>, 'minus': <filename>}
where filename refers to the corresponding bamfile (strand-split)
`reference`: reference, as specified in the bamfile header
`region`: tuple (min, max) of the positions that define the region
'''
cov = []
for strand,filename in bamfiles.items():
bamfile = pysam.AlignmentFile(filename, 'rb')
start,end = region
pileup = bamfile.pileup(reference, start, end,
truncate=True,
max_depth=1e9)
for col in pileup:
cov.append((col.reference_pos, col.n, strand))
df = pd.DataFrame.from_records(cov, columns=('position', 'coverage', 'strand'))
df['coverage'] = np.where(df['strand'] == 'minus', -1*df['coverage'], df['coverage'])
return df
def strand_coverage(strands, limits, title, anno=None):
_strands = ['plus', 'minus']
data = strand_data(strands, reference, limits)
xs = []
ys = []
for strand in _strands:
region = list(data[data['strand'] == strand]['position'])
xs.append(np.hstack((region, region[::-1])))
ys.append(np.hstack((data[data['strand']==strand]['coverage'], np.zeros(len(region)))))
colors = brewer["Spectral"][4]
alphas = [0.8, 0.5]
max_yval = max([np.max(np.abs(y)) for y in ys])
max_xval = max([np.max(x) + 100 for x in xs])
min_xval = min([np.min(x) for x in xs])
p = figure(title=title,
y_range=(-max_yval, max_yval),
x_range=(min_xval, max_xval+1))
p.patches(xs, ys, color="#268bd2",
alpha=alphas, line_color=None)
if anno is not None:
tracks = set(anno['track'])
anno['trk'] = len(tracks) + 0.5
anno.ix[anno.strand=='+', 'trk'] *= -1
anno['trkname'] = anno['trk']
for i,track in enumerate(tracks):
anno.ix[(anno.track==track) & (anno.strand=='-'), 'trk'] -= i / 2
anno.ix[(anno.track==track) & (anno.strand=='-'), 'trkname'] -= (i/2) + 0.25
anno.ix[(anno.track==track) & (anno.strand=='+'), 'trk'] += i / 2
anno.ix[(anno.track==track) & (anno.strand=='+'), 'trkname'] += (i/2) - 0.25
p.extra_y_ranges = {'tracks': Range1d(start=-len(tracks)-1, end=len(tracks)+1)}
p.add_layout(LinearAxis(y_range_name='tracks'), 'right')
p.rect(x='center', y='trk', width='length', height=0.25, fill_color='blue',
source=ColumnDataSource(anno), fill_alpha=0.2, y_range_name='tracks')
p.text(x='center', y='trkname', text='name', text_align='center',
source=ColumnDataSource(anno), y_range_name='tracks',
text_font_size='10pt', text_baseline="middle")
p.text(x=max_xval, y='trk', text='track', text_align='right',
source=ColumnDataSource(anno), y_range_name='tracks', text_baseline='middle')
mark_dist = (max_xval - min_xval) // 20
marks = [[(i, ann['trk'], ann['strand']) for i in range(min_xval, max_xval, mark_dist)
if ann['start'] < i < ann['end']] for _,ann in anno.iterrows()]
for m in marks:
p.text(x=[x[0] for x in m],
y=[x[1] for x in m],
text=['>' if x[2] == '+' else '<' for x in m],
text_align='right',
y_range_name='tracks', text_baseline='middle')
p.plot_width = 900
p.grid.grid_line_color = None
p.yaxis.minor_tick_line_color = None
p.yaxis[1].major_tick_line_color = None
p.yaxis[1].major_label_text_color = None
p.xaxis.minor_tick_line_color = None
p.xaxis[0].ticker = FixedTicker(ticks=[limits[0], limits[0]+(limits[1]-limits[0])//2, limits[1]])
p.xaxis[0].formatter = NumeralTickFormatter(format="0,0")
p.yaxis[0].formatter = NumeralTickFormatter(format="(0,0)")
return p
In [5]:
def extract_gff(line, ftype='gene'):
rec = {}
fields = line.strip().split('\t')
if fields[2] != ftype:
return None
opts = {}
for f in fields[8].split(';'):
k,v = f.split('=')
opts[k] = v
rec.update({
'gene': opts['gene'],
'start': int(fields[3]),
'end': int(fields[4]),
'strand': fields[6]
})
return rec
def extract_bed(line):
rec = {}
fields = line.strip().split('\t')
rec.update({
'TU': fields[3],
'start': int(fields[1]),
'end': int(fields[2])
})
return rec
def anno_df(annofile, extract=extract_gff):
result = []
with open(annofile) as fi:
for line in fi:
if not line.startswith('#'):
rec = extract(line)
if rec:
result.append(rec)
return pd.DataFrame.from_records(result)
In [10]:
gff = anno_df('../ref/NC_000913.gff')
gff
Out[10]:
In [11]:
utr = anno_df('../ref/utrs_corr.bed', extract=extract_bed)
utr
Out[11]:
In [12]:
gff_genes = set(gff['gene'])
def _strand(rec):
if rec['TU'] in gff_genes:
return gff[gff['gene'] == rec['TU']]['strand'].values[0]
else: return '-'
utr['strand'] = utr.apply(_strand, axis=1)
utr
Out[12]:
In [18]:
# We need this for the reference name to match
utr['reference'] = 'NC_000913.3'
utr.to_csv('../ref/utrs_corr.stranded.bed', sep='\t',
columns=['reference', 'start', 'end', 'strand', 'TU'],
header=False, index=False)
In [19]:
!head ../ref/utrs_corr.stranded.bed
In [6]:
def region_plot(feature, padding=50, limits=None):
def _fcheck(rec):
s = rec['TU'].replace('_', '-')
return feature in s.split('-')
vg = gff[gff['gene'] == feature][['start','end']].values
vu = utr[utr.apply(_fcheck, axis=1)][['start','end']].values
if limits is None:
region = np.vstack((vg, vu))
rmax = np.max(region)
rmin = np.min(region)
if padding is not None:
rmax += padding
rmin -= padding
limits = (rmin, rmax)
utrs = utr[utr.apply(_fcheck, axis=1)].copy()
utrs['length'] = utrs['end'] - utrs['start']
utrs.sort_values('length', ascending=False, inplace=True)
utr_= utrs.iloc[0].to_frame().transpose()
utr_['name'] = feature
utr_['track'] = 'UTR'
gene = gff[gff['gene'] == feature].copy()
gene['track'] = 'gene'
gene['name'] = feature
genes = pd.concat([gene, utr_])
genes['center'] = genes.start + (genes.end - genes.start) // 2
genes['length'] = genes.end - genes.start
p1 = strand_coverage(strands_nobcm, limits, "WT -BCM", genes)
p2 = strand_coverage(strands_bcm, limits, "WT +BCM", genes)
p = vplot(p1, p2)
show(p)
In [74]:
region_plot('thiM', limits=(2184500, 2185460))
In [75]:
region_plot('lysC', limits=(4233100, 4233550))
In [79]:
region_plot('moaA', limits=(816850, 817350))
In [24]:
strands_nobcm = {
'plus': '../results/19261_plus.bam',
'minus': '../results/19261_minus.bam'
}
strands_bcm = {
'plus': '../results/19262_plus.bam',
'minus': '../results/19262_minus.bam',
}
In [25]:
region_plot('rpoS')