In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import pysam

In [3]:
# Sample titles with corresponding barcodes
# s9: WT
# s9+bcm: WT +BCM
# s17: triple sRNA mutant

samples = {
    's9': ['ATCACG', 'ACAGTG'],
    's9+bcm': ['CGATGT', 'GCCAAT'],
    's17': ['TTAGGC', 'GATCAG'],
    }

# Barcodes
barcodes = ['ATCACG', 'ACAGTG', 'CGATGT', 'GCCAAT', 'TTAGGC', 'GATCAG']

In [6]:
bamfile = pysam.AlignmentFile('../results/19255_ATCACG_H8UHCADXX_1_20140509B_20140509_sorted.bam', 'rb')
reference = bamfile.header['SQ'][0]['SN']
reference


Out[6]:
'gi|556503834|ref|NC_000913.3|'

In [7]:
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.plotting import figure, show, output_notebook
output_notebook()


Loading BokehJS ...

In [4]:
from bokeh.plotting import vplot
from bokeh.palettes import brewer
from bokeh.models import FixedTicker, NumeralTickFormatter, Range1d, LinearAxis

strands_nobcm = {
    'plus': '../results/19255_plus.bam',
    'minus': '../results/19255_minus.bam'
}

strands_bcm = {
    'plus': '../results/19256_plus.bam',
    'minus': '../results/19256_minus.bam',
}


def strand_data(bamfiles, reference, region):
    '''
    Returns a dataframe with coverage data by strand for the given bamfiles,
    reference and region
    
    `bamfiles`: dictionary of the form {'plus': <filename>, 'minus': <filename>}
        where filename refers to the corresponding bamfile (strand-split)
    
    `reference`: reference, as specified in the bamfile header
    
    `region`: tuple (min, max) of the positions that define the region
    '''
    cov = []
    for strand,filename in bamfiles.items():
        bamfile = pysam.AlignmentFile(filename, 'rb')
        start,end = region
        pileup = bamfile.pileup(reference, start, end,
                               truncate=True,
                               max_depth=1e9)
        for col in pileup:
            cov.append((col.reference_pos, col.n, strand))
    df = pd.DataFrame.from_records(cov, columns=('position', 'coverage', 'strand'))
    df['coverage'] = np.where(df['strand'] == 'minus', -1*df['coverage'], df['coverage'])
    return df



def strand_coverage(strands, limits, title, anno=None):
    _strands = ['plus', 'minus'] 
    data = strand_data(strands, reference, limits)
    xs = []
    ys = []
    for strand in _strands:
        region = list(data[data['strand'] == strand]['position'])
        xs.append(np.hstack((region, region[::-1])))
        ys.append(np.hstack((data[data['strand']==strand]['coverage'], np.zeros(len(region)))))

    colors = brewer["Spectral"][4]
    alphas = [0.8, 0.5]
    max_yval = max([np.max(np.abs(y)) for y in ys])
    max_xval = max([np.max(x) + 100 for x in xs])
    min_xval = min([np.min(x) for x in xs])
    p = figure(title=title,
               y_range=(-max_yval, max_yval),
               x_range=(min_xval, max_xval+1))

    p.patches(xs, ys, color="#268bd2",
             alpha=alphas, line_color=None)
    
    if anno is not None:
        tracks = set(anno['track'])
        anno['trk'] = len(tracks) + 0.5
        anno.ix[anno.strand=='+', 'trk'] *= -1
        anno['trkname'] = anno['trk']
        for i,track in enumerate(tracks):
            anno.ix[(anno.track==track) & (anno.strand=='-'), 'trk'] -= i / 2
            anno.ix[(anno.track==track) & (anno.strand=='-'), 'trkname'] -= (i/2) + 0.25
            anno.ix[(anno.track==track) & (anno.strand=='+'), 'trk'] += i / 2
            anno.ix[(anno.track==track) & (anno.strand=='+'), 'trkname'] += (i/2) - 0.25
        p.extra_y_ranges = {'tracks': Range1d(start=-len(tracks)-1, end=len(tracks)+1)}
        p.add_layout(LinearAxis(y_range_name='tracks'), 'right')
        
        p.rect(x='center', y='trk', width='length', height=0.25, fill_color='blue',
              source=ColumnDataSource(anno), fill_alpha=0.2, y_range_name='tracks')
        p.text(x='center', y='trkname', text='name', text_align='center',
               source=ColumnDataSource(anno), y_range_name='tracks',
              text_font_size='10pt', text_baseline="middle")
        p.text(x=max_xval, y='trk', text='track', text_align='right',
               source=ColumnDataSource(anno), y_range_name='tracks', text_baseline='middle')
        
        mark_dist = (max_xval - min_xval) // 20
        marks = [[(i, ann['trk'], ann['strand']) for i in range(min_xval, max_xval, mark_dist)
                if ann['start'] < i < ann['end']] for _,ann in anno.iterrows()]
        
        for m in marks:
            p.text(x=[x[0] for x in m],
                   y=[x[1] for x in m],
                   text=['>' if x[2] == '+' else '<' for x in m],
                   text_align='right',
                   y_range_name='tracks', text_baseline='middle')
            

    p.plot_width = 900
    p.grid.grid_line_color = None
    p.yaxis.minor_tick_line_color = None
    p.yaxis[1].major_tick_line_color = None
    p.yaxis[1].major_label_text_color = None
    p.xaxis.minor_tick_line_color = None
    p.xaxis[0].ticker = FixedTicker(ticks=[limits[0], limits[0]+(limits[1]-limits[0])//2, limits[1]])
    p.xaxis[0].formatter = NumeralTickFormatter(format="0,0")
    p.yaxis[0].formatter = NumeralTickFormatter(format="(0,0)")
    return p

In [5]:
def extract_gff(line, ftype='gene'):
    rec = {}
    fields = line.strip().split('\t')
    if fields[2] != ftype:
        return None
    opts = {}
    for f in fields[8].split(';'):
        k,v = f.split('=')
        opts[k] = v
        
    rec.update({
            'gene': opts['gene'],
            'start': int(fields[3]),
            'end': int(fields[4]),
            'strand': fields[6]
        })
    return rec


def extract_bed(line):
    rec = {}
    fields = line.strip().split('\t')
    rec.update({
            'TU': fields[3],
            'start': int(fields[1]),
            'end': int(fields[2])
        })
    return rec



def anno_df(annofile, extract=extract_gff):
    result = []
    with open(annofile) as fi:
        for line in fi:
            if not line.startswith('#'):
                rec = extract(line)
                if rec:
                    result.append(rec)
    return pd.DataFrame.from_records(result)

In [10]:
gff = anno_df('../ref/NC_000913.gff')
gff


Out[10]:
end gene start strand
0 255 thrL 190 +
1 2799 thrA 337 +
2 3733 thrB 2801 +
3 5020 thrC 3734 +
4 5530 yaaX 5234 +
5 6459 yaaA 5683 -
6 7959 yaaJ 6529 -
7 9191 talB 8238 +
8 9893 mog 9306 +
9 10494 yaaH 9928 -
10 11356 yaaW 10643 -
11 11786 yaaI 11382 -
12 14079 dnaK 12163 +
13 15298 dnaJ 14168 +
14 16557 insL1 15445 +
15 16960 mokC 16751 -
16 16903 hokC 16751 -
17 17006 sokC 16952 +
18 18655 nhaA 17489 +
19 19620 nhaR 18715 +
20 20314 insB1 19811 -
21 20508 insA 20233 -
22 21078 rpsT 20815 -
23 21399 yaaY 21181 +
24 22348 ribF 21407 +
25 25207 ileS 22391 +
26 25701 lspA 25207 +
27 26275 fkpB 25826 +
28 27227 ispH 26277 +
29 28207 rihC 27293 +
... ... ... ... ...
4486 4611003 prfC 4609414 +
4487 4612001 osmY 4611396 +
4488 4612289 ytjA 4612128 +
4489 4613484 yjjU 4612411 +
4490 4614260 yjjV 4613481 +
4491 4615543 yjjW 4614680 -
4492 4617065 yjjI 4615515 -
4493 4618102 deoC 4617323 +
4494 4619551 deoA 4618229 +
4495 4620826 deoB 4619603 +
4496 4621602 deoD 4620883 +
4497 4623100 yjjJ 4621769 +
4498 4624117 lplA 4623101 -
4499 4624789 ytjB 4624145 -
4500 4625863 serB 4624895 +
4501 4627294 radA 4625912 +
4502 4628547 nadR 4627315 +
4503 4630522 yjjK 4628855 -
4504 4632670 slt 4630733 +
4505 4633086 trpR 4632760 +
4506 4633745 yjjX 4633233 -
4507 4634444 ytjC 4633797 +
4508 4635310 rob 4634441 -
4509 4635994 creA 4635521 +
4510 4636696 creB 4636007 +
4511 4638120 creC 4636696 +
4512 4639530 creD 4638178 +
4513 4640306 arcA 4639590 -
4514 4640542 yjjY 4640402 +
4515 4641628 yjtD 4640942 +

4516 rows × 4 columns


In [11]:
utr = anno_df('../ref/utrs_corr.bed', extract=extract_bed)
utr


Out[11]:
TU end start
0 thrL 190 148
1 thrL 190 148
2 yaaX 5234 5030
3 yaaA 6587 6459
4 yaaA 6615 6459
5 yaaJ 8017 7959
6 talB 8238 8191
7 htgA 10830 10643
8 htgA 10830 10644
9 yaaW 11542 11356
10 yaaI 11825 11786
11 yaaI 11913 11786
12 yaaI 11938 11786
13 dnaK 12163 12048
14 dnaK 12163 12123
15 dnaK 12163 12144
16 insL-1 15445 15380
17 insL-1 15445 15436
18 hokC 16951 16903
19 nhaA 17489 17317
20 nhaA 17489 17458
21 rpsT 21120 21078
22 rpsT 21210 21078
23 ribF 21407 21383
24 ileS 22391 21833
25 ileS 22391 22034
26 ileS 22391 22229
27 lspA 25207 25014
28 dapB 28374 28288
29 dapB 28374 28343
... ... ... ...
3750 prfC 4609414 4609344
3751 prfC 4609414 4609356
3752 osmY 4611396 4611153
3753 deoC 4617323 4616679
3754 deoC 4617323 4617278
3755 deoB 4619603 4619567
3756 yjjJ 4621769 4621657
3757 yjjJ 4621769 4621716
3758 lplA 4624238 4624117
3759 ytjB 4624799 4624789
3760 serB 4624895 4624856
3761 yjjK 4630566 4630522
3762 slt 4630733 4630700
3763 trpR 4632760 4632704
3764 yjjX 4633773 4633745
3765 yjjX 4633899 4633745
3766 creA 4635521 4635243
3767 rob 4635353 4635310
3768 creA 4635521 4635477
3769 creD 4638178 4638160
3770 yjjY 4640402 4640358
3771 arcA 4640508 4640306
3772 arcA 4640512 4640306
3773 arcA 4640535 4640306
3774 arcA 4640599 4640306
3775 arcA 4640681 4640306
3776 arcA 4640688 4640306
3777 arcA 4640801 4640306
3778 yjtD 4640942 4640838
3779 yjtD 4640942 4640898

3780 rows × 3 columns


In [12]:
gff_genes = set(gff['gene'])

def _strand(rec):
    if rec['TU'] in gff_genes:
        return gff[gff['gene'] == rec['TU']]['strand'].values[0]
    else: return '-'

utr['strand'] = utr.apply(_strand, axis=1)
utr


Out[12]:
TU end start strand
0 thrL 190 148 +
1 thrL 190 148 +
2 yaaX 5234 5030 +
3 yaaA 6587 6459 -
4 yaaA 6615 6459 -
5 yaaJ 8017 7959 -
6 talB 8238 8191 +
7 htgA 10830 10643 -
8 htgA 10830 10644 -
9 yaaW 11542 11356 -
10 yaaI 11825 11786 -
11 yaaI 11913 11786 -
12 yaaI 11938 11786 -
13 dnaK 12163 12048 +
14 dnaK 12163 12123 +
15 dnaK 12163 12144 +
16 insL-1 15445 15380 -
17 insL-1 15445 15436 -
18 hokC 16951 16903 -
19 nhaA 17489 17317 +
20 nhaA 17489 17458 +
21 rpsT 21120 21078 -
22 rpsT 21210 21078 -
23 ribF 21407 21383 +
24 ileS 22391 21833 +
25 ileS 22391 22034 +
26 ileS 22391 22229 +
27 lspA 25207 25014 +
28 dapB 28374 28288 +
29 dapB 28374 28343 +
... ... ... ... ...
3750 prfC 4609414 4609344 +
3751 prfC 4609414 4609356 +
3752 osmY 4611396 4611153 +
3753 deoC 4617323 4616679 +
3754 deoC 4617323 4617278 +
3755 deoB 4619603 4619567 +
3756 yjjJ 4621769 4621657 +
3757 yjjJ 4621769 4621716 +
3758 lplA 4624238 4624117 -
3759 ytjB 4624799 4624789 -
3760 serB 4624895 4624856 +
3761 yjjK 4630566 4630522 -
3762 slt 4630733 4630700 +
3763 trpR 4632760 4632704 +
3764 yjjX 4633773 4633745 -
3765 yjjX 4633899 4633745 -
3766 creA 4635521 4635243 +
3767 rob 4635353 4635310 -
3768 creA 4635521 4635477 +
3769 creD 4638178 4638160 +
3770 yjjY 4640402 4640358 +
3771 arcA 4640508 4640306 -
3772 arcA 4640512 4640306 -
3773 arcA 4640535 4640306 -
3774 arcA 4640599 4640306 -
3775 arcA 4640681 4640306 -
3776 arcA 4640688 4640306 -
3777 arcA 4640801 4640306 -
3778 yjtD 4640942 4640838 +
3779 yjtD 4640942 4640898 +

3780 rows × 4 columns


In [18]:
# We need this for the reference name to match
utr['reference'] = 'NC_000913.3'
utr.to_csv('../ref/utrs_corr.stranded.bed', sep='\t',
          columns=['reference', 'start', 'end', 'strand', 'TU'],
          header=False, index=False)

In [19]:
!head ../ref/utrs_corr.stranded.bed


NC_000913.3	148	190	+	thrL
NC_000913.3	148	190	+	thrL
NC_000913.3	5030	5234	+	yaaX
NC_000913.3	6459	6587	-	yaaA
NC_000913.3	6459	6615	-	yaaA
NC_000913.3	7959	8017	-	yaaJ
NC_000913.3	8191	8238	+	talB
NC_000913.3	10643	10830	-	htgA
NC_000913.3	10644	10830	-	htgA
NC_000913.3	11356	11542	-	yaaW

In [6]:
def region_plot(feature, padding=50, limits=None):
    
    def _fcheck(rec):
        s = rec['TU'].replace('_', '-')
        return feature in s.split('-')
    
    vg = gff[gff['gene'] == feature][['start','end']].values
    vu = utr[utr.apply(_fcheck, axis=1)][['start','end']].values
    
    if limits is None:
        region = np.vstack((vg, vu))
        rmax = np.max(region)
        rmin = np.min(region)
        if padding is not None:
            rmax += padding
            rmin -= padding
        limits = (rmin, rmax)
    
    utrs = utr[utr.apply(_fcheck, axis=1)].copy()
    utrs['length'] = utrs['end'] - utrs['start']
    utrs.sort_values('length', ascending=False, inplace=True)
    
    utr_= utrs.iloc[0].to_frame().transpose()
    utr_['name'] = feature
    utr_['track'] = 'UTR'
    gene = gff[gff['gene'] == feature].copy()
    gene['track'] = 'gene'
    gene['name'] = feature
    
    genes = pd.concat([gene, utr_])
    genes['center'] = genes.start + (genes.end - genes.start) // 2
    genes['length'] = genes.end - genes.start

    p1 = strand_coverage(strands_nobcm, limits, "WT -BCM", genes)
    p2 = strand_coverage(strands_bcm, limits, "WT +BCM", genes)
    p = vplot(p1, p2)
    show(p)

In [74]:
region_plot('thiM', limits=(2184500, 2185460))



In [75]:
region_plot('lysC', limits=(4233100, 4233550))



In [79]:
region_plot('moaA', limits=(816850, 817350))



In [24]:
strands_nobcm = {
    'plus': '../results/19261_plus.bam',
    'minus': '../results/19261_minus.bam'
}

strands_bcm = {
    'plus': '../results/19262_plus.bam',
    'minus': '../results/19262_minus.bam',
}

In [25]:
region_plot('rpoS')