In [1]:
%matplotlib inline

In [2]:
#from __future__ import division
import pandas as pd
import numpy as np
from ggplot import *

import os
import sys

In [3]:
# Sample titles with corresponding barcodes
# s9: WT
# s9+bcm: WT +BCM
# s17: triple sRNA mutant

samples = {
    's9': ['ATCACG', 'ACAGTG'],
    's9+bcm': ['CGATGT', 'GCCAAT'],
    's17': ['TTAGGC', 'GATCAG'],
    }

# Barcodes
barcodes = ['ATCACG', 'ACAGTG', 'CGATGT', 'GCCAAT', 'TTAGGC', 'GATCAG']

In [4]:
def extract_gff(line, ftype='gene'):
    rec = {}
    fields = line.strip().split('\t')
    if fields[2] != ftype:
        return None
    opts = {}
    for f in fields[8].split(';'):
        k,v = f.split('=')
        opts[k] = v
        
    rec.update({
            'gene': opts['gene'],
            'start': int(fields[3]),
            'end': int(fields[4]),
            'strand': fields[6]
        })
    return rec


def extract_bed(line):
    rec = {}
    fields = line.strip().split('\t')
    rec.update({
            'TU': ''.join([x for x in fields[3].split('-')[2:]]),
            'start': int(fields[1]),
            'end': int(fields[2]),
            'strand': fields[5]
        })
    return rec



def anno_df(annofile, extract=extract_gff):
    result = []
    with open(annofile) as fi:
        for line in fi:
            if not line.startswith('#'):
                rec = extract(line)
                if rec:
                    result.append(rec)
    return pd.DataFrame.from_records(result)

In [5]:
gff = anno_df('../ref/NC_000913.gff')
gff


Out[5]:
end gene start strand
0 255 thrL 190 +
1 2799 thrA 337 +
2 3733 thrB 2801 +
3 5020 thrC 3734 +
4 5530 yaaX 5234 +
5 6459 yaaA 5683 -
6 7959 yaaJ 6529 -
7 9191 talB 8238 +
8 9893 mog 9306 +
9 10494 yaaH 9928 -
10 11356 yaaW 10643 -
11 11786 yaaI 11382 -
12 14079 dnaK 12163 +
13 15298 dnaJ 14168 +
14 16557 insL1 15445 +
15 16960 mokC 16751 -
16 16903 hokC 16751 -
17 17006 sokC 16952 +
18 18655 nhaA 17489 +
19 19620 nhaR 18715 +
20 20314 insB1 19811 -
21 20508 insA 20233 -
22 21078 rpsT 20815 -
23 21399 yaaY 21181 +
24 22348 ribF 21407 +
25 25207 ileS 22391 +
26 25701 lspA 25207 +
27 26275 fkpB 25826 +
28 27227 ispH 26277 +
29 28207 rihC 27293 +
... ... ... ... ...
4486 4611003 prfC 4609414 +
4487 4612001 osmY 4611396 +
4488 4612289 ytjA 4612128 +
4489 4613484 yjjU 4612411 +
4490 4614260 yjjV 4613481 +
4491 4615543 yjjW 4614680 -
4492 4617065 yjjI 4615515 -
4493 4618102 deoC 4617323 +
4494 4619551 deoA 4618229 +
4495 4620826 deoB 4619603 +
4496 4621602 deoD 4620883 +
4497 4623100 yjjJ 4621769 +
4498 4624117 lplA 4623101 -
4499 4624789 ytjB 4624145 -
4500 4625863 serB 4624895 +
4501 4627294 radA 4625912 +
4502 4628547 nadR 4627315 +
4503 4630522 yjjK 4628855 -
4504 4632670 slt 4630733 +
4505 4633086 trpR 4632760 +
4506 4633745 yjjX 4633233 -
4507 4634444 ytjC 4633797 +
4508 4635310 rob 4634441 -
4509 4635994 creA 4635521 +
4510 4636696 creB 4636007 +
4511 4638120 creC 4636696 +
4512 4639530 creD 4638178 +
4513 4640306 arcA 4639590 -
4514 4640542 yjjY 4640402 +
4515 4641628 yjtD 4640942 +

4516 rows × 4 columns


In [6]:
df = pd.read_table('../ref/UTR_5_3_sequence.new.bed', sep='\t',
                            header=None,
                            names=['TSS', 'TU_name', 'coord_3', 'coord_5', 'first_gene_3',
                                   'first_gene_5', 'first_gene_name', 'operon', 'promoter',
                                   'seq_5UTR', 'strand'])
df


Out[6]:
TSS TU_name coord_3 coord_5 first_gene_3 first_gene_5 first_gene_name operon promoter seq_5UTR strand
0 148 thrLABC 190 148 255 190 thrL thrLABC thrLp ATAGCGCACAGACAGATAAAAATTACAGAGTACACAACATCCA forward
1 148 thrL 190 148 255 190 thrL thrLABC thrLp ATAGCGCACAGACAGATAAAAATTACAGAGTACACAACATCCA forward
2 5030 yaaX 5234 5030 5530 5234 yaaX yaaX yaaXp4 ATTATCTCAATCAGGCCGGGTTTGCTTTTATGCAGCCCGGCTTTTT... forward
3 6587 yaaA 6587 6459 6459 5683 yaaA yaaA yaaAp3 ATCCGGATATCGGTCGCCAGCTTTCTCCGGACGCGTGGGATGATGT... reverse
4 6615 yaaA 6615 6459 6459 5683 yaaA yaaA yaaAp5 GTGCGCCCGGTGTTTGATCCATTGCGTTATCCGGATATCGGTCGCC... reverse
5 8017 yaaJ 8017 7959 7959 6529 yaaJ yaaJ yaaJp4 AACAATTGGTAACGTTTACACAGGAAAGTCATCGCGACCGGCAATA... reverse
6 8191 talB 8238 8191 9191 8238 talB talB talBp AGACCGGTTACATCCCCCTAACAAGCTGTTTAAAGAGAAATACTATCA forward
7 10643 htgA 10830 10643 11315 10830 htgA htgA htgAp2 TCAGACCTGAGTGGCGCTAACCATCCGGCGCAGGCAGGCGATTTGC... forward
8 10644 htgA 10830 10644 11315 10830 htgA htgA htgAp1 CAGACCTGAGTGGCGCTAACCATCCGGCGCAGGCAGGCGATTTGCA... forward
9 11542 yaaW 11542 11356 11356 10643 yaaW yaaW yaaWp3 GCCTGAATATTCCTTCAGAAATAAAAGAAGGGCAAACCACTGACTG... reverse
10 11825 yaaI 11825 11786 11786 11382 yaaI yaaI yaaIp4 GTATTCATCCCCCTACCTCTTCCCACTAAGAGAATCCTTA reverse
11 11913 yaaI 11913 11786 11786 11382 yaaI yaaI yaaIp3 ATCGCAGCCAAGGCATTCATCAAAAAATTGTAATAAAAAGAAAAGA... reverse
12 11938 yaaI 11938 11786 11786 11382 yaaI yaaI yaaIp2 GCAATTTTATTCATATAAAGAATGAATCGCAGCCAAGGCATTCATC... reverse
13 12048 dnaK-tpke11-dnaJ 12163 12048 14079 12163 dnaK dnaK-tpke11-dnaJ dnaKp1 AACCGCAGTGAGTGAGTCTGCAAAAAAATGAAATTGGGCAGTTGAA... forward
14 12123 dnaK-tpke11-dnaJ 12163 12123 14079 12163 dnaK dnaK-tpke11-dnaJ dnaKp2 ACAACCACATGATGACCGAATATATAGTGGAGACGTTTAGA forward
15 12144 dnaK-tpke11-dnaJ 12163 12144 14079 12163 dnaK dnaK-tpke11-dnaJ dnaKp3 ATATAGTGGAGACGTTTAGA forward
16 15380 insL-1 15445 15380 16557 15445 insL-1 insL-1 insLp5 GGATCACTCCCATAAGCGCTAACTTAAGGGTTGTGGTATTACGCCT... forward
17 15436 insL-1 15445 15436 16557 15445 insL-1 insL-1 insLp6 AACGTGCCGA forward
18 16951 hokC 16951 16903 16903 16751 hokC mokC-hokC hokCp4 ACATGTAGAGTGCCTCTTACTGACCGTAAGGTCAAGGAGAAGAGAGCAA reverse
19 17317 nhaAR 17489 17317 18655 17489 nhaA nhaAR nhaAp2 GGTCACTCGTGAGCGCTTACAGCCGTCAAAAACGCATCTCACCGCT... forward
20 17458 nhaAR 17489 17458 18655 17489 nhaA nhaAR nhaAp1 ACGATCTATTCACCTGAAAGAGAAATAAAAAG forward
21 21120 rpsT 21120 21078 21078 20815 rpsT rpsT rpsTp2 GCCTTTGAATTGTCCATATAGAACACATTTGGGAGTTGGACCT reverse
22 21210 rpsT 21210 21078 21078 20815 rpsT rpsT rpsTp1 GCCATCACTACGTAACGAGTGCCGGCACATTAACGGCGCTTATTTG... reverse
23 21383 ribF-ileS-lspA-fkpB-ispH 21407 21383 22348 21407 ribF ribF-ileS-lspA-fkpB-ispH ribFp GATTTTCACTGTTTTGAGCCAGACA forward
24 21833 ileS-lspA-fkpB-ispH 22391 21833 25207 22391 ileS ribF-ileS-lspA-fkpB-ispH ileSp1 GCTGGCATGGAATACGGCTTCGATATCACCAGTACGCAAACTTTTT... forward
25 22034 ileS-lspA-fkpB-ispH 22391 22034 25207 22391 ileS ribF-ileS-lspA-fkpB-ispH ileSp2 GCGAATGTACCGCTGCGCCGTCAGGTTTCCCCGGTGAAAGGGGTTT... forward
26 22229 ileS-lspA-fkpB-ispH 22391 22229 25207 22391 ileS ribF-ileS-lspA-fkpB-ispH ileSp3 GTGCTGCGTAAAAAAATACGCAATGAGCAGCGATTTGCGTCGCTGG... forward
27 25014 lspA-fkpB-ispH 25207 25014 25701 25207 lspA ribF-ileS-lspA-fkpB-ispH lspAp ACGCACCTGCTGATGCTCAGCAGAGCGAAGTACTCAAAGGGCTGAA... forward
28 28288 dapB 28374 28288 29195 28374 dapB dapB dapBp2 TAATTATCAGCGTTTTTGGCTGGCGGCGTAGCGATGCGCTGGTTAC... forward
29 28343 dapB 28374 28343 29195 28374 dapB dapB dapBp1 GGTCTATGCAAATTAACAAAAGAGAATAGCTA forward
... ... ... ... ... ... ... ... ... ... ... ...
3750 4609344 prfC 4609414 4609344 4611003 4609414 prfC prfC prfCp2 GGTAAAATAGCCGCAATTTTTCGTTTTCAACAAGCGCGGCGCGATG... forward
3751 4609356 prfC 4609414 4609356 4611003 4609414 prfC prfC prfCp4 GCAATTTTTCGTTTTCAACAAGCGCGGCGCGATGCCGCTTACTCAA... forward
3752 4611153 osmY 4611396 4611153 4612001 4611396 osmY osmY osmYp GTGATGACATTTCTGACGGCGTTAAATACCGTTCAATGCGTAGATA... forward
3753 4616679 deoCABD 4617323 4616679 4618102 4617323 deoC deoCABD deoCp1 ATACGGTTGCAACAACGCATCCAGTTGCCCCAGGTAGACCGGCATC... forward
3754 4617278 deoCABD 4617323 4617278 4618102 4617323 deoC deoCABD deoCp2 AAACTCGCAAGGTGAATTTTATTGGCGACAAGCCAGGAGAATGAAA forward
3755 4619567 deoBD 4619603 4619567 4620826 4619603 deoB deoCABD deoBp ATCATTTAAATTTGAAGCACTGAGTACGGAGAACATA forward
3756 4621657 yjjJ 4621769 4621657 4623100 4621769 yjjJ yjjJ yjjJp2 GGCTTTTTAGTATCTATTCATTTTTCTCTCCAGCTTGAATATTTTC... forward
3757 4621716 yjjJ 4621769 4621716 4623100 4621769 yjjJ yjjJ yjjJp4 GTGAAATGTGTTAATAAATCTATTCAAGTATCTATTCACGAATCTA... forward
3758 4624238 lplA 4624238 4624117 4624117 4623101 lplA ytjB-lplA lplAp2 ACAGGGTAAACGCACCCGCTGGCAGCAATCGCCCTTCCTGTTAACC... reverse
3759 4624799 ytjB-lplA 4624799 4624789 4624789 4624145 ytjB ytjB-lplA ytjBp AGGGTGGAAGA reverse
3760 4624856 serB-radA 4624895 4624856 4625863 4624895 serB serB-radA-nadR serBp GTTATTTTCCCTGCTTCGAACGATTTTACAGGAGCCTTAA forward
3761 4630566 yjjK 4630566 4630522 4630522 4628855 yjjK yjjK yjjKp11 TTTGAAAACTCATTATCACGATAAACATAGAGGCGAAGTCCAACG reverse
3762 4630700 slt 4630733 4630700 4632670 4630733 slt slt sltp7 GCATTGATGTATTTACACTTAGAGGATGCGCTTG forward
3763 4632704 trpR 4632760 4632704 4633086 4632760 trpR trpR trpRp AGCGAGTACAACCGGGGGAGGCATTTTGCTTCCCCCGCTAACAATG... forward
3764 4633773 yjjX 4633773 4633745 4633745 4633233 yjjX yjjX yjjXp5 AACGTTAATTTTCTGAGTAATGCTGATTA reverse
3765 4633899 yjjX 4633899 4633745 4633745 4633233 yjjX yjjX yjjXp6 GTTGCTCACCTTTGGCGGTCAGCGGGCTGTCAGACTGGCCCTGAAT... reverse
3766 4635243 creABC 4635521 4635243 4635994 4635521 creA creABCD creAp2 GACAGGGGCTGATCCAGATGACCTTCCAGCCAGATTAAAAGGTCGC... forward
3767 4635353 rob 4635353 4635310 4635310 4634441 rob rob robp ACCTGATGTCAGGTGCTCGTTGTTGAAAGGATGAGGATATTTTA reverse
3768 4635477 creABCD 4635521 4635477 4635994 4635521 creA creABCD creAp GTGACGAACTAATTGCTCGTGTAATAGATAAAAATGGTAACAATA forward
3769 4638160 creD 4638178 4638160 4639530 4638178 creD creABCD creDp ATTGCAAAGGAGAAGACTA forward
3770 4640358 yjjY 4640402 4640358 4640542 4640402 yjjY yjjY yjjYp6 TGACCTGCCTGATGCATGCTGCAAATTAACATGATCGGCGTAACA forward
3771 4640508 arcA 4640508 4640306 4640306 4639590 arcA arcA arcAp1 GTTTTTGACACTGTCGGGTCCTGAGGGAAAGTACCCACGACCAAGC... reverse
3772 4640512 arcA 4640512 4640306 4640306 4639590 arcA arcA arcAp2 AGCCGTTTTTGACACTGTCGGGTCCTGAGGGAAAGTACCCACGACC... reverse
3773 4640535 arcA 4640535 4640306 4640306 4639590 arcA arcA arcAp3 GCTGTTAAAATGGTTAGGATGACAGCCGTTTTTGACACTGTCGGGT... reverse
3774 4640599 arcA 4640599 4640306 4640306 4639590 arcA arcA arcAp4 ACTTGATATATGTCAACGAAGCGTAGTTTTATTGGGTGTCCGGCCC... reverse
3775 4640681 arcA 4640681 4640306 4640306 4639590 arcA arcA arcAp5 TAGTTGGATTATTAAAATAATGTGACGAAAGCTAGCATTTAGATAC... reverse
3776 4640688 arcA 4640688 4640306 4640306 4639590 arcA arcA arcAp6 ATGCAACTAGTTGGATTATTAAAATAATGTGACGAAAGCTAGCATT... reverse
3777 4640801 arcA 4640801 4640306 4640306 4639590 arcA arcA arcAp7 CTGTACTAACGGTTGAGTTGTTAAAAAATGCTACATATCCTTCTGT... reverse
3778 4640838 yjtD 4640942 4640838 4641628 4640942 yjtD yjtD yjtDp8 GGGCTTTTTCTGCGACTTACGTTAAGAATTTGTAAATTCGCACCGC... forward
3779 4640898 yjtD 4640942 4640898 4641628 4640942 yjtD yjtD yjtDp6 GTGATCACCCGGTTCGCGGTTATTTGATCAAGAAGAGTGGCAATA forward

3780 rows × 11 columns


In [7]:
utrs = df[['TSS', 'coord_3', 'coord_5', 'first_gene_name', 'strand']].copy()
utrs.rename(columns={'coord_3': 'end', 'coord_5': 'start', 'first_gene_name': 'gene'}, inplace=True)
utrs.loc[utrs['strand'] == 'forward', 'strand'] = '+'
utrs.loc[utrs['strand'] == 'reverse', 'strand'] = '-'
utrs


Out[7]:
TSS end start gene strand
0 148 190 148 thrL +
1 148 190 148 thrL +
2 5030 5234 5030 yaaX +
3 6587 6587 6459 yaaA -
4 6615 6615 6459 yaaA -
5 8017 8017 7959 yaaJ -
6 8191 8238 8191 talB +
7 10643 10830 10643 htgA +
8 10644 10830 10644 htgA +
9 11542 11542 11356 yaaW -
10 11825 11825 11786 yaaI -
11 11913 11913 11786 yaaI -
12 11938 11938 11786 yaaI -
13 12048 12163 12048 dnaK +
14 12123 12163 12123 dnaK +
15 12144 12163 12144 dnaK +
16 15380 15445 15380 insL-1 +
17 15436 15445 15436 insL-1 +
18 16951 16951 16903 hokC -
19 17317 17489 17317 nhaA +
20 17458 17489 17458 nhaA +
21 21120 21120 21078 rpsT -
22 21210 21210 21078 rpsT -
23 21383 21407 21383 ribF +
24 21833 22391 21833 ileS +
25 22034 22391 22034 ileS +
26 22229 22391 22229 ileS +
27 25014 25207 25014 lspA +
28 28288 28374 28288 dapB +
29 28343 28374 28343 dapB +
... ... ... ... ... ...
3750 4609344 4609414 4609344 prfC +
3751 4609356 4609414 4609356 prfC +
3752 4611153 4611396 4611153 osmY +
3753 4616679 4617323 4616679 deoC +
3754 4617278 4617323 4617278 deoC +
3755 4619567 4619603 4619567 deoB +
3756 4621657 4621769 4621657 yjjJ +
3757 4621716 4621769 4621716 yjjJ +
3758 4624238 4624238 4624117 lplA -
3759 4624799 4624799 4624789 ytjB -
3760 4624856 4624895 4624856 serB +
3761 4630566 4630566 4630522 yjjK -
3762 4630700 4630733 4630700 slt +
3763 4632704 4632760 4632704 trpR +
3764 4633773 4633773 4633745 yjjX -
3765 4633899 4633899 4633745 yjjX -
3766 4635243 4635521 4635243 creA +
3767 4635353 4635353 4635310 rob -
3768 4635477 4635521 4635477 creA +
3769 4638160 4638178 4638160 creD +
3770 4640358 4640402 4640358 yjjY +
3771 4640508 4640508 4640306 arcA -
3772 4640512 4640512 4640306 arcA -
3773 4640535 4640535 4640306 arcA -
3774 4640599 4640599 4640306 arcA -
3775 4640681 4640681 4640306 arcA -
3776 4640688 4640688 4640306 arcA -
3777 4640801 4640801 4640306 arcA -
3778 4640838 4640942 4640838 yjtD +
3779 4640898 4640942 4640898 yjtD +

3780 rows × 5 columns


In [8]:
# 5' UTR annotations
"""
res = []
with open('../../results/redux/utrs_corr.bed', 'r') as fi:
    for line in fi:
        fields = line.strip().split()
        res.append({
                'gene': fields[3],
                'start': int(fields[1]),
                'end': int(fields[2]),
            })
 
utrs = pd.DataFrame.from_records(res)
utrs['UTR_length'] = utrs.end - utrs.start
utrs
"""


Out[8]:
"\nres = []\nwith open('../../results/redux/utrs_corr.bed', 'r') as fi:\n    for line in fi:\n        fields = line.strip().split()\n        res.append({\n                'gene': fields[3],\n                'start': int(fields[1]),\n                'end': int(fields[2]),\n            })\n \nutrs = pd.DataFrame.from_records(res)\nutrs['UTR_length'] = utrs.end - utrs.start\nutrs\n"

In [9]:
dfa = utrs.merge(gff, on='gene', how='left').dropna()
dfa['strand'] = dfa['strand_x']
dfa


Out[9]:
TSS end_x start_x gene strand_x end_y start_y strand_y strand
0 148 190 148 thrL + 255.0 190.0 + +
1 148 190 148 thrL + 255.0 190.0 + +
2 5030 5234 5030 yaaX + 5530.0 5234.0 + +
3 6587 6587 6459 yaaA - 6459.0 5683.0 - -
4 6615 6615 6459 yaaA - 6459.0 5683.0 - -
5 8017 8017 7959 yaaJ - 7959.0 6529.0 - -
6 8191 8238 8191 talB + 9191.0 8238.0 + +
9 11542 11542 11356 yaaW - 11356.0 10643.0 - -
10 11825 11825 11786 yaaI - 11786.0 11382.0 - -
11 11913 11913 11786 yaaI - 11786.0 11382.0 - -
12 11938 11938 11786 yaaI - 11786.0 11382.0 - -
13 12048 12163 12048 dnaK + 14079.0 12163.0 + +
14 12123 12163 12123 dnaK + 14079.0 12163.0 + +
15 12144 12163 12144 dnaK + 14079.0 12163.0 + +
18 16951 16951 16903 hokC - 16903.0 16751.0 - -
19 17317 17489 17317 nhaA + 18655.0 17489.0 + +
20 17458 17489 17458 nhaA + 18655.0 17489.0 + +
21 21120 21120 21078 rpsT - 21078.0 20815.0 - -
22 21210 21210 21078 rpsT - 21078.0 20815.0 - -
23 21383 21407 21383 ribF + 22348.0 21407.0 + +
24 21833 22391 21833 ileS + 25207.0 22391.0 + +
25 22034 22391 22034 ileS + 25207.0 22391.0 + +
26 22229 22391 22229 ileS + 25207.0 22391.0 + +
27 25014 25207 25014 lspA + 25701.0 25207.0 + +
28 28288 28374 28288 dapB + 29195.0 28374.0 + +
29 28343 28374 28343 dapB + 29195.0 28374.0 + +
30 29551 29651 29551 carA + 30799.0 29651.0 + +
31 29619 29651 29619 carA + 30799.0 29651.0 + +
32 30775 30817 30775 carB + 34038.0 30817.0 + +
33 34218 34300 34218 caiF + 34695.0 34300.0 + +
... ... ... ... ... ... ... ... ... ...
3754 4609344 4609414 4609344 prfC + 4611003.0 4609414.0 + +
3755 4609356 4609414 4609356 prfC + 4611003.0 4609414.0 + +
3756 4611153 4611396 4611153 osmY + 4612001.0 4611396.0 + +
3757 4616679 4617323 4616679 deoC + 4618102.0 4617323.0 + +
3758 4617278 4617323 4617278 deoC + 4618102.0 4617323.0 + +
3759 4619567 4619603 4619567 deoB + 4620826.0 4619603.0 + +
3760 4621657 4621769 4621657 yjjJ + 4623100.0 4621769.0 + +
3761 4621716 4621769 4621716 yjjJ + 4623100.0 4621769.0 + +
3762 4624238 4624238 4624117 lplA - 4624117.0 4623101.0 - -
3763 4624799 4624799 4624789 ytjB - 4624789.0 4624145.0 - -
3764 4624856 4624895 4624856 serB + 4625863.0 4624895.0 + +
3765 4630566 4630566 4630522 yjjK - 4630522.0 4628855.0 - -
3766 4630700 4630733 4630700 slt + 4632670.0 4630733.0 + +
3767 4632704 4632760 4632704 trpR + 4633086.0 4632760.0 + +
3768 4633773 4633773 4633745 yjjX - 4633745.0 4633233.0 - -
3769 4633899 4633899 4633745 yjjX - 4633745.0 4633233.0 - -
3770 4635243 4635521 4635243 creA + 4635994.0 4635521.0 + +
3771 4635353 4635353 4635310 rob - 4635310.0 4634441.0 - -
3772 4635477 4635521 4635477 creA + 4635994.0 4635521.0 + +
3773 4638160 4638178 4638160 creD + 4639530.0 4638178.0 + +
3774 4640358 4640402 4640358 yjjY + 4640542.0 4640402.0 + +
3775 4640508 4640508 4640306 arcA - 4640306.0 4639590.0 - -
3776 4640512 4640512 4640306 arcA - 4640306.0 4639590.0 - -
3777 4640535 4640535 4640306 arcA - 4640306.0 4639590.0 - -
3778 4640599 4640599 4640306 arcA - 4640306.0 4639590.0 - -
3779 4640681 4640681 4640306 arcA - 4640306.0 4639590.0 - -
3780 4640688 4640688 4640306 arcA - 4640306.0 4639590.0 - -
3781 4640801 4640801 4640306 arcA - 4640306.0 4639590.0 - -
3782 4640838 4640942 4640838 yjtD + 4641628.0 4640942.0 + +
3783 4640898 4640942 4640898 yjtD + 4641628.0 4640942.0 + +

3651 rows × 9 columns


In [10]:
import pysam


def get_ratio(df, bamfile, reference, win=50, offset=200):

    def _utr_orf_ratio(rec):
        
        def _counts(start, end):
            count = 1
            #start = int(rec[key5])
            #end = int(rec[key3])
            #if end < start:
            #    start, end = end, start
            for read in bam.fetch(reference, start, end):
                count += 1
            return float(count)
        
        if rec['strand'] == '-':
            end_utr = rec['end_x']
            start_utr = end_utr - win
            end_orf = end_utr - offset
            start_orf = end_orf - win
        else:
            start_utr = rec['start_x']
            end_utr = start_utr + win
            start_orf = rec['start_x'] + offset
            end_orf = start_orf + win
            
        return _counts(start_utr, end_utr) / _counts(start_orf, end_orf)

    bam = pysam.AlignmentFile(bamfile, 'rb')
    return df.apply(_utr_orf_ratio, axis=1)




def get_utr_ratio_df(data, barcodes, res_dir='../results', win=50, offset=200):
    '''
    Calculates 5'UTR coverage
    Iterates over files whose name ends with '_sorted.bam' in `res_dir` and contains barcodes
    specified in `barcodes`.
    
    Adds `utr_<barcode>` column to df DataFrame
    '''
    d, _, filenames = next(os.walk(res_dir))
    infiles = [f for f in filenames if f.endswith('_sorted.bam')]
    df = data.copy()
    for barcode in barcodes:
        bamfile = os.path.join(d, [f for f in infiles if barcode in f][0])
        df['ratio_{0}'.format(barcode)] = get_ratio(df,
                                            bamfile, 
                                            'gi|556503834|ref|NC_000913.3|',
                                            win, offset)
    output.put({'data': df, 'winsize': win, 'offset': offset})

In [11]:
import multiprocessing as mp


output = mp.Queue()

offsets = [150,200,300]
winsizes = [50,80,100,200]
output_tpl = '../results/dfa_mp.offset_{}.win_{}.csv'


def utr_ratio(data, barcodes, res_dir='../results', win=50, offset=200):
    '''
    Calculates 5'UTR coverage
    Iterates over files whose name ends with '_sorted.bam' in `res_dir` and contains barcodes
    specified in `barcodes`.
    
    Adds `utr_<barcode>` column to df DataFrame
    '''
    d, _, filenames = next(os.walk(res_dir))
    infiles = [f for f in filenames if f.endswith('_sorted.bam')]
    df = data.copy()
    for barcode in barcodes:
        bamfile = os.path.join(d, [f for f in infiles if barcode in f][0])
        df['ratio_{0}'.format(barcode)] = get_ratio(df,
                                            bamfile, 
                                            'gi|556503834|ref|NC_000913.3|',
                                            win, offset)
    df.to_csv(output_tpl.format(offset, win))
    output.put({'winsize': win, 'offset': offset, 'filename': output_tpl.format(offset, win)})

    
pool = []

for offset in offsets:
    for winsize in winsizes:
        d = dfa.copy()
        pool.append(mp.Process(target=utr_ratio,
                               args=(d, ['ATCACG', 'ACAGTG', 'CGATGT', 'GCCAAT']),
                                kwargs={'res_dir': '../results/',
                                        'offset': offset,
                                        'win': winsize}))
        
for p in pool:
    p.start()
    
for p in pool:
    p.join()
    
    
results = [output.get() for p in pool]

In [12]:
results


Out[12]:
[{'filename': '../../results/redux/dfa_mp.offset_150.win_50.csv',
  'offset': 150,
  'winsize': 50},
 {'filename': '../../results/redux/dfa_mp.offset_200.win_50.csv',
  'offset': 200,
  'winsize': 50},
 {'filename': '../../results/redux/dfa_mp.offset_300.win_50.csv',
  'offset': 300,
  'winsize': 50},
 {'filename': '../../results/redux/dfa_mp.offset_150.win_80.csv',
  'offset': 150,
  'winsize': 80},
 {'filename': '../../results/redux/dfa_mp.offset_300.win_100.csv',
  'offset': 300,
  'winsize': 100},
 {'filename': '../../results/redux/dfa_mp.offset_300.win_80.csv',
  'offset': 300,
  'winsize': 80},
 {'filename': '../../results/redux/dfa_mp.offset_200.win_80.csv',
  'offset': 200,
  'winsize': 80},
 {'filename': '../../results/redux/dfa_mp.offset_150.win_100.csv',
  'offset': 150,
  'winsize': 100},
 {'filename': '../../results/redux/dfa_mp.offset_200.win_100.csv',
  'offset': 200,
  'winsize': 100},
 {'filename': '../../results/redux/dfa_mp.offset_300.win_200.csv',
  'offset': 300,
  'winsize': 200},
 {'filename': '../../results/redux/dfa_mp.offset_200.win_200.csv',
  'offset': 200,
  'winsize': 200},
 {'filename': '../../results/redux/dfa_mp.offset_150.win_200.csv',
  'offset': 150,
  'winsize': 200}]

In [ ]:
output.get()