notebook.community

Edit and run



In [1]:

    
%matplotlib inline



In [2]:

    
import numpy as np
import pandas as pd
import pysam



In [3]:

    
# Sample titles with corresponding barcodes
# s9: WT
# s9+bcm: WT +BCM
# s17: triple sRNA mutant

samples = {
    's9': ['ATCACG', 'ACAGTG'],
    's9+bcm': ['CGATGT', 'GCCAAT'],
    's17': ['TTAGGC', 'GATCAG'],
    }

# Barcodes
barcodes = ['ATCACG', 'ACAGTG', 'CGATGT', 'GCCAAT', 'TTAGGC', 'GATCAG']



In [6]:

    
bamfile = pysam.AlignmentFile('../results/19255_ATCACG_H8UHCADXX_1_20140509B_20140509_sorted.bam', 'rb')
reference = bamfile.header['SQ'][0]['SN']
reference









    Out[6]:





'gi|556503834|ref|NC_000913.3|'



In [7]:

    
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.plotting import figure, show, output_notebook
output_notebook()









    





    
        
        Loading BokehJS ...



In [4]:

    
from bokeh.plotting import vplot
from bokeh.palettes import brewer
from bokeh.models import FixedTicker, NumeralTickFormatter, Range1d, LinearAxis

strands_nobcm = {
    'plus': '../results/19255_plus.bam',
    'minus': '../results/19255_minus.bam'
}

strands_bcm = {
    'plus': '../results/19256_plus.bam',
    'minus': '../results/19256_minus.bam',
}


def strand_data(bamfiles, reference, region):
    '''
    Returns a dataframe with coverage data by strand for the given bamfiles,
    reference and region
    
    `bamfiles`: dictionary of the form {'plus': <filename>, 'minus': <filename>}
        where filename refers to the corresponding bamfile (strand-split)
    
    `reference`: reference, as specified in the bamfile header
    
    `region`: tuple (min, max) of the positions that define the region
    '''
    cov = []
    for strand,filename in bamfiles.items():
        bamfile = pysam.AlignmentFile(filename, 'rb')
        start,end = region
        pileup = bamfile.pileup(reference, start, end,
                               truncate=True,
                               max_depth=1e9)
        for col in pileup:
            cov.append((col.reference_pos, col.n, strand))
    df = pd.DataFrame.from_records(cov, columns=('position', 'coverage', 'strand'))
    df['coverage'] = np.where(df['strand'] == 'minus', -1*df['coverage'], df['coverage'])
    return df



def strand_coverage(strands, limits, title, anno=None):
    _strands = ['plus', 'minus'] 
    data = strand_data(strands, reference, limits)
    xs = []
    ys = []
    for strand in _strands:
        region = list(data[data['strand'] == strand]['position'])
        xs.append(np.hstack((region, region[::-1])))
        ys.append(np.hstack((data[data['strand']==strand]['coverage'], np.zeros(len(region)))))

    colors = brewer["Spectral"][4]
    alphas = [0.8, 0.5]
    max_yval = max([np.max(np.abs(y)) for y in ys])
    max_xval = max([np.max(x) + 100 for x in xs])
    min_xval = min([np.min(x) for x in xs])
    p = figure(title=title,
               y_range=(-max_yval, max_yval),
               x_range=(min_xval, max_xval+1))

    p.patches(xs, ys, color="#268bd2",
             alpha=alphas, line_color=None)
    
    if anno is not None:
        tracks = set(anno['track'])
        anno['trk'] = len(tracks) + 0.5
        anno.ix[anno.strand=='+', 'trk'] *= -1
        anno['trkname'] = anno['trk']
        for i,track in enumerate(tracks):
            anno.ix[(anno.track==track) & (anno.strand=='-'), 'trk'] -= i / 2
            anno.ix[(anno.track==track) & (anno.strand=='-'), 'trkname'] -= (i/2) + 0.25
            anno.ix[(anno.track==track) & (anno.strand=='+'), 'trk'] += i / 2
            anno.ix[(anno.track==track) & (anno.strand=='+'), 'trkname'] += (i/2) - 0.25
        p.extra_y_ranges = {'tracks': Range1d(start=-len(tracks)-1, end=len(tracks)+1)}
        p.add_layout(LinearAxis(y_range_name='tracks'), 'right')
        
        p.rect(x='center', y='trk', width='length', height=0.25, fill_color='blue',
              source=ColumnDataSource(anno), fill_alpha=0.2, y_range_name='tracks')
        p.text(x='center', y='trkname', text='name', text_align='center',
               source=ColumnDataSource(anno), y_range_name='tracks',
              text_font_size='10pt', text_baseline="middle")
        p.text(x=max_xval, y='trk', text='track', text_align='right',
               source=ColumnDataSource(anno), y_range_name='tracks', text_baseline='middle')
        
        mark_dist = (max_xval - min_xval) // 20
        marks = [[(i, ann['trk'], ann['strand']) for i in range(min_xval, max_xval, mark_dist)
                if ann['start'] < i < ann['end']] for _,ann in anno.iterrows()]
        
        for m in marks:
            p.text(x=[x[0] for x in m],
                   y=[x[1] for x in m],
                   text=['>' if x[2] == '+' else '<' for x in m],
                   text_align='right',
                   y_range_name='tracks', text_baseline='middle')
            

    p.plot_width = 900
    p.grid.grid_line_color = None
    p.yaxis.minor_tick_line_color = None
    p.yaxis[1].major_tick_line_color = None
    p.yaxis[1].major_label_text_color = None
    p.xaxis.minor_tick_line_color = None
    p.xaxis[0].ticker = FixedTicker(ticks=[limits[0], limits[0]+(limits[1]-limits[0])//2, limits[1]])
    p.xaxis[0].formatter = NumeralTickFormatter(format="0,0")
    p.yaxis[0].formatter = NumeralTickFormatter(format="(0,0)")
    return p



In [5]:

    
def extract_gff(line, ftype='gene'):
    rec = {}
    fields = line.strip().split('\t')
    if fields[2] != ftype:
        return None
    opts = {}
    for f in fields[8].split(';'):
        k,v = f.split('=')
        opts[k] = v
        
    rec.update({
            'gene': opts['gene'],
            'start': int(fields[3]),
            'end': int(fields[4]),
            'strand': fields[6]
        })
    return rec


def extract_bed(line):
    rec = {}
    fields = line.strip().split('\t')
    rec.update({
            'TU': fields[3],
            'start': int(fields[1]),
            'end': int(fields[2])
        })
    return rec



def anno_df(annofile, extract=extract_gff):
    result = []
    with open(annofile) as fi:
        for line in fi:
            if not line.startswith('#'):
                rec = extract(line)
                if rec:
                    result.append(rec)
    return pd.DataFrame.from_records(result)



In [10]:

    
gff = anno_df('../ref/NC_000913.gff')
gff









    Out[10]:






  
    
      
      end
      gene
      start
      strand
    
  
  
    
      0
      255
      thrL
      190
      +
    
    
      1
      2799
      thrA
      337
      +
    
    
      2
      3733
      thrB
      2801
      +
    
    
      3
      5020
      thrC
      3734
      +
    
    
      4
      5530
      yaaX
      5234
      +
    
    
      5
      6459
      yaaA
      5683
      -
    
    
      6
      7959
      yaaJ
      6529
      -
    
    
      7
      9191
      talB
      8238
      +
    
    
      8
      9893
      mog
      9306
      +
    
    
      9
      10494
      yaaH
      9928
      -
    
    
      10
      11356
      yaaW
      10643
      -
    
    
      11
      11786
      yaaI
      11382
      -
    
    
      12
      14079
      dnaK
      12163
      +
    
    
      13
      15298
      dnaJ
      14168
      +
    
    
      14
      16557
      insL1
      15445
      +
    
    
      15
      16960
      mokC
      16751
      -
    
    
      16
      16903
      hokC
      16751
      -
    
    
      17
      17006
      sokC
      16952
      +
    
    
      18
      18655
      nhaA
      17489
      +
    
    
      19
      19620
      nhaR
      18715
      +
    
    
      20
      20314
      insB1
      19811
      -
    
    
      21
      20508
      insA
      20233
      -
    
    
      22
      21078
      rpsT
      20815
      -
    
    
      23
      21399
      yaaY
      21181
      +
    
    
      24
      22348
      ribF
      21407
      +
    
    
      25
      25207
      ileS
      22391
      +
    
    
      26
      25701
      lspA
      25207
      +
    
    
      27
      26275
      fkpB
      25826
      +
    
    
      28
      27227
      ispH
      26277
      +
    
    
      29
      28207
      rihC
      27293
      +
    
    
      ...
      ...
      ...
      ...
      ...
    
    
      4486
      4611003
      prfC
      4609414
      +
    
    
      4487
      4612001
      osmY
      4611396
      +
    
    
      4488
      4612289
      ytjA
      4612128
      +
    
    
      4489
      4613484
      yjjU
      4612411
      +
    
    
      4490
      4614260
      yjjV
      4613481
      +
    
    
      4491
      4615543
      yjjW
      4614680
      -
    
    
      4492
      4617065
      yjjI
      4615515
      -
    
    
      4493
      4618102
      deoC
      4617323
      +
    
    
      4494
      4619551
      deoA
      4618229
      +
    
    
      4495
      4620826
      deoB
      4619603
      +
    
    
      4496
      4621602
      deoD
      4620883
      +
    
    
      4497
      4623100
      yjjJ
      4621769
      +
    
    
      4498
      4624117
      lplA
      4623101
      -
    
    
      4499
      4624789
      ytjB
      4624145
      -
    
    
      4500
      4625863
      serB
      4624895
      +
    
    
      4501
      4627294
      radA
      4625912
      +
    
    
      4502
      4628547
      nadR
      4627315
      +
    
    
      4503
      4630522
      yjjK
      4628855
      -
    
    
      4504
      4632670
      slt
      4630733
      +
    
    
      4505
      4633086
      trpR
      4632760
      +
    
    
      4506
      4633745
      yjjX
      4633233
      -
    
    
      4507
      4634444
      ytjC
      4633797
      +
    
    
      4508
      4635310
      rob
      4634441
      -
    
    
      4509
      4635994
      creA
      4635521
      +
    
    
      4510
      4636696
      creB
      4636007
      +
    
    
      4511
      4638120
      creC
      4636696
      +
    
    
      4512
      4639530
      creD
      4638178
      +
    
    
      4513
      4640306
      arcA
      4639590
      -
    
    
      4514
      4640542
      yjjY
      4640402
      +
    
    
      4515
      4641628
      yjtD
      4640942
      +
    
  

4516 rows × 4 columns



In [11]:

    
utr = anno_df('../ref/utrs_corr.bed', extract=extract_bed)
utr









    Out[11]:






  
    
      
      TU
      end
      start
    
  
  
    
      0
      thrL
      190
      148
    
    
      1
      thrL
      190
      148
    
    
      2
      yaaX
      5234
      5030
    
    
      3
      yaaA
      6587
      6459
    
    
      4
      yaaA
      6615
      6459
    
    
      5
      yaaJ
      8017
      7959
    
    
      6
      talB
      8238
      8191
    
    
      7
      htgA
      10830
      10643
    
    
      8
      htgA
      10830
      10644
    
    
      9
      yaaW
      11542
      11356
    
    
      10
      yaaI
      11825
      11786
    
    
      11
      yaaI
      11913
      11786
    
    
      12
      yaaI
      11938
      11786
    
    
      13
      dnaK
      12163
      12048
    
    
      14
      dnaK
      12163
      12123
    
    
      15
      dnaK
      12163
      12144
    
    
      16
      insL-1
      15445
      15380
    
    
      17
      insL-1
      15445
      15436
    
    
      18
      hokC
      16951
      16903
    
    
      19
      nhaA
      17489
      17317
    
    
      20
      nhaA
      17489
      17458
    
    
      21
      rpsT
      21120
      21078
    
    
      22
      rpsT
      21210
      21078
    
    
      23
      ribF
      21407
      21383
    
    
      24
      ileS
      22391
      21833
    
    
      25
      ileS
      22391
      22034
    
    
      26
      ileS
      22391
      22229
    
    
      27
      lspA
      25207
      25014
    
    
      28
      dapB
      28374
      28288
    
    
      29
      dapB
      28374
      28343
    
    
      ...
      ...
      ...
      ...
    
    
      3750
      prfC
      4609414
      4609344
    
    
      3751
      prfC
      4609414
      4609356
    
    
      3752
      osmY
      4611396
      4611153
    
    
      3753
      deoC
      4617323
      4616679
    
    
      3754
      deoC
      4617323
      4617278
    
    
      3755
      deoB
      4619603
      4619567
    
    
      3756
      yjjJ
      4621769
      4621657
    
    
      3757
      yjjJ
      4621769
      4621716
    
    
      3758
      lplA
      4624238
      4624117
    
    
      3759
      ytjB
      4624799
      4624789
    
    
      3760
      serB
      4624895
      4624856
    
    
      3761
      yjjK
      4630566
      4630522
    
    
      3762
      slt
      4630733
      4630700
    
    
      3763
      trpR
      4632760
      4632704
    
    
      3764
      yjjX
      4633773
      4633745
    
    
      3765
      yjjX
      4633899
      4633745
    
    
      3766
      creA
      4635521
      4635243
    
    
      3767
      rob
      4635353
      4635310
    
    
      3768
      creA
      4635521
      4635477
    
    
      3769
      creD
      4638178
      4638160
    
    
      3770
      yjjY
      4640402
      4640358
    
    
      3771
      arcA
      4640508
      4640306
    
    
      3772
      arcA
      4640512
      4640306
    
    
      3773
      arcA
      4640535
      4640306
    
    
      3774
      arcA
      4640599
      4640306
    
    
      3775
      arcA
      4640681
      4640306
    
    
      3776
      arcA
      4640688
      4640306
    
    
      3777
      arcA
      4640801
      4640306
    
    
      3778
      yjtD
      4640942
      4640838
    
    
      3779
      yjtD
      4640942
      4640898
    
  

3780 rows × 3 columns



In [12]:

    
gff_genes = set(gff['gene'])

def _strand(rec):
    if rec['TU'] in gff_genes:
        return gff[gff['gene'] == rec['TU']]['strand'].values[0]
    else: return '-'

utr['strand'] = utr.apply(_strand, axis=1)
utr









    Out[12]:






  
    
      
      TU
      end
      start
      strand
    
  
  
    
      0
      thrL
      190
      148
      +
    
    
      1
      thrL
      190
      148
      +
    
    
      2
      yaaX
      5234
      5030
      +
    
    
      3
      yaaA
      6587
      6459
      -
    
    
      4
      yaaA
      6615
      6459
      -
    
    
      5
      yaaJ
      8017
      7959
      -
    
    
      6
      talB
      8238
      8191
      +
    
    
      7
      htgA
      10830
      10643
      -
    
    
      8
      htgA
      10830
      10644
      -
    
    
      9
      yaaW
      11542
      11356
      -
    
    
      10
      yaaI
      11825
      11786
      -
    
    
      11
      yaaI
      11913
      11786
      -
    
    
      12
      yaaI
      11938
      11786
      -
    
    
      13
      dnaK
      12163
      12048
      +
    
    
      14
      dnaK
      12163
      12123
      +
    
    
      15
      dnaK
      12163
      12144
      +
    
    
      16
      insL-1
      15445
      15380
      -
    
    
      17
      insL-1
      15445
      15436
      -
    
    
      18
      hokC
      16951
      16903
      -
    
    
      19
      nhaA
      17489
      17317
      +
    
    
      20
      nhaA
      17489
      17458
      +
    
    
      21
      rpsT
      21120
      21078
      -
    
    
      22
      rpsT
      21210
      21078
      -
    
    
      23
      ribF
      21407
      21383
      +
    
    
      24
      ileS
      22391
      21833
      +
    
    
      25
      ileS
      22391
      22034
      +
    
    
      26
      ileS
      22391
      22229
      +
    
    
      27
      lspA
      25207
      25014
      +
    
    
      28
      dapB
      28374
      28288
      +
    
    
      29
      dapB
      28374
      28343
      +
    
    
      ...
      ...
      ...
      ...
      ...
    
    
      3750
      prfC
      4609414
      4609344
      +
    
    
      3751
      prfC
      4609414
      4609356
      +
    
    
      3752
      osmY
      4611396
      4611153
      +
    
    
      3753
      deoC
      4617323
      4616679
      +
    
    
      3754
      deoC
      4617323
      4617278
      +
    
    
      3755
      deoB
      4619603
      4619567
      +
    
    
      3756
      yjjJ
      4621769
      4621657
      +
    
    
      3757
      yjjJ
      4621769
      4621716
      +
    
    
      3758
      lplA
      4624238
      4624117
      -
    
    
      3759
      ytjB
      4624799
      4624789
      -
    
    
      3760
      serB
      4624895
      4624856
      +
    
    
      3761
      yjjK
      4630566
      4630522
      -
    
    
      3762
      slt
      4630733
      4630700
      +
    
    
      3763
      trpR
      4632760
      4632704
      +
    
    
      3764
      yjjX
      4633773
      4633745
      -
    
    
      3765
      yjjX
      4633899
      4633745
      -
    
    
      3766
      creA
      4635521
      4635243
      +
    
    
      3767
      rob
      4635353
      4635310
      -
    
    
      3768
      creA
      4635521
      4635477
      +
    
    
      3769
      creD
      4638178
      4638160
      +
    
    
      3770
      yjjY
      4640402
      4640358
      +
    
    
      3771
      arcA
      4640508
      4640306
      -
    
    
      3772
      arcA
      4640512
      4640306
      -
    
    
      3773
      arcA
      4640535
      4640306
      -
    
    
      3774
      arcA
      4640599
      4640306
      -
    
    
      3775
      arcA
      4640681
      4640306
      -
    
    
      3776
      arcA
      4640688
      4640306
      -
    
    
      3777
      arcA
      4640801
      4640306
      -
    
    
      3778
      yjtD
      4640942
      4640838
      +
    
    
      3779
      yjtD
      4640942
      4640898
      +
    
  

3780 rows × 4 columns



In [18]:

    
# We need this for the reference name to match
utr['reference'] = 'NC_000913.3'
utr.to_csv('../ref/utrs_corr.stranded.bed', sep='\t',
          columns=['reference', 'start', 'end', 'strand', 'TU'],
          header=False, index=False)



In [19]:

    
!head ../ref/utrs_corr.stranded.bed









    



NC_000913.3	148	190	+	thrL
NC_000913.3	148	190	+	thrL
NC_000913.3	5030	5234	+	yaaX
NC_000913.3	6459	6587	-	yaaA
NC_000913.3	6459	6615	-	yaaA
NC_000913.3	7959	8017	-	yaaJ
NC_000913.3	8191	8238	+	talB
NC_000913.3	10643	10830	-	htgA
NC_000913.3	10644	10830	-	htgA
NC_000913.3	11356	11542	-	yaaW



In [6]:

    
def region_plot(feature, padding=50, limits=None):
    
    def _fcheck(rec):
        s = rec['TU'].replace('_', '-')
        return feature in s.split('-')
    
    vg = gff[gff['gene'] == feature][['start','end']].values
    vu = utr[utr.apply(_fcheck, axis=1)][['start','end']].values
    
    if limits is None:
        region = np.vstack((vg, vu))
        rmax = np.max(region)
        rmin = np.min(region)
        if padding is not None:
            rmax += padding
            rmin -= padding
        limits = (rmin, rmax)
    
    utrs = utr[utr.apply(_fcheck, axis=1)].copy()
    utrs['length'] = utrs['end'] - utrs['start']
    utrs.sort_values('length', ascending=False, inplace=True)
    
    utr_= utrs.iloc[0].to_frame().transpose()
    utr_['name'] = feature
    utr_['track'] = 'UTR'
    gene = gff[gff['gene'] == feature].copy()
    gene['track'] = 'gene'
    gene['name'] = feature
    
    genes = pd.concat([gene, utr_])
    genes['center'] = genes.start + (genes.end - genes.start) // 2
    genes['length'] = genes.end - genes.start

    p1 = strand_coverage(strands_nobcm, limits, "WT -BCM", genes)
    p2 = strand_coverage(strands_bcm, limits, "WT +BCM", genes)
    p = vplot(p1, p2)
    show(p)



In [74]:

    
region_plot('thiM', limits=(2184500, 2185460))



In [75]:

    
region_plot('lysC', limits=(4233100, 4233550))



In [79]:

    
region_plot('moaA', limits=(816850, 817350))



In [24]:

    
strands_nobcm = {
    'plus': '../results/19261_plus.bam',
    'minus': '../results/19261_minus.bam'
}

strands_bcm = {
    'plus': '../results/19262_plus.bam',
    'minus': '../results/19262_minus.bam',
}



In [25]:

    
region_plot('rpoS')

	end	gene	start	strand
0	255	thrL	190	+
1	2799	thrA	337	+
2	3733	thrB	2801	+
3	5020	thrC	3734	+
4	5530	yaaX	5234	+
5	6459	yaaA	5683	-
6	7959	yaaJ	6529	-
7	9191	talB	8238	+
8	9893	mog	9306	+
9	10494	yaaH	9928	-
10	11356	yaaW	10643	-
11	11786	yaaI	11382	-
12	14079	dnaK	12163	+
13	15298	dnaJ	14168	+
14	16557	insL1	15445	+
15	16960	mokC	16751	-
16	16903	hokC	16751	-
17	17006	sokC	16952	+
18	18655	nhaA	17489	+
19	19620	nhaR	18715	+
20	20314	insB1	19811	-
21	20508	insA	20233	-
22	21078	rpsT	20815	-
23	21399	yaaY	21181	+
24	22348	ribF	21407	+
25	25207	ileS	22391	+
26	25701	lspA	25207	+
27	26275	fkpB	25826	+
28	27227	ispH	26277	+
29	28207	rihC	27293	+
...	...	...	...	...
4486	4611003	prfC	4609414	+
4487	4612001	osmY	4611396	+
4488	4612289	ytjA	4612128	+
4489	4613484	yjjU	4612411	+
4490	4614260	yjjV	4613481	+
4491	4615543	yjjW	4614680	-
4492	4617065	yjjI	4615515	-
4493	4618102	deoC	4617323	+
4494	4619551	deoA	4618229	+
4495	4620826	deoB	4619603	+
4496	4621602	deoD	4620883	+
4497	4623100	yjjJ	4621769	+
4498	4624117	lplA	4623101	-
4499	4624789	ytjB	4624145	-
4500	4625863	serB	4624895	+
4501	4627294	radA	4625912	+
4502	4628547	nadR	4627315	+
4503	4630522	yjjK	4628855	-
4504	4632670	slt	4630733	+
4505	4633086	trpR	4632760	+
4506	4633745	yjjX	4633233	-
4507	4634444	ytjC	4633797	+
4508	4635310	rob	4634441	-
4509	4635994	creA	4635521	+
4510	4636696	creB	4636007	+
4511	4638120	creC	4636696	+
4512	4639530	creD	4638178	+
4513	4640306	arcA	4639590	-
4514	4640542	yjjY	4640402	+
4515	4641628	yjtD	4640942	+