In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import os

from ggplot import *

Overview

  • select 5'UTRs longer than 80 nt
  • count reads aligned to these UTRs (pysam)
  • plot utr reads -bcm vs utr reads + bcm
  • select UTRs with increased number of reads upon addition of BCM (clustering?)
  • compare selected UTRs with genes upregulated in the stationary phase as discovered by DESeq2
  • compare selected UTRs with small RNA binding sites (pybedtools?)

Sample table and barcodes


In [3]:
# Sample titles with corresponding barcodes
samples = {
    's9': ['ATCACG', 'ACAGTG'],
    's9+bcm': ['CGATGT', 'GCCAAT'],
    's17': ['TTAGGC', 'GATCAG'],
    's17+bcm': ['TGACCA', 'TAGCTT'],
    's19': ['CAGATC','GGCTAC'],
    's19+bcm': ['ACTTGA', 'CTTGTA']
}

# Barcodes
barcodes = ['ATCACG', 'ACAGTG', 'CGATGT', 'GCCAAT', 'TTAGGC', 'GATCAG', 'TGACCA', 'TAGCTT', 'CAGATC','GGCTAC', 'ACTTGA', 'CTTGTA']

Load counts for genes, calculate counts in UTRs longer than 80 nt

Gene counts were obtained using htseq program against the standard NC_000913 .GFF file The was I calculate reads in UTRs here is not strand-specific. So the numbers can be confounded if there is a transcript going in the opposite direction. We can solve this later if needed.


In [4]:
dfm = pd.DataFrame.from_csv('../data/dfm.csv', sep='\t')
dfm


Out[4]:
TSS TU_name coord_3 coord_5 first_gene_3 first_gene_5 gene operon promoter seq_5UTR ... utr_CGATGT utr_GCCAAT utr_TTAGGC utr_GATCAG utr_TGACCA utr_TAGCTT utr_CAGATC utr_GGCTAC utr_ACTTGA utr_CTTGTA
0 5030 yaaX 5234 5030 5530 5234 yaaX yaaX yaaXp4 ATTATCTCAATCAGGCCGGGTTTGCTTTTATGCAGCCCGGCTTTTT... ... 351 370 298 448 439 446 137 235 479 450
1 6587 yaaA 6587 6459 6459 5683 yaaA yaaA yaaAp3 ATCCGGATATCGGTCGCCAGCTTTCTCCGGACGCGTGGGATGATGT... ... 163 225 175 315 181 281 206 114 388 305
2 6615 yaaA 6615 6459 6459 5683 yaaA yaaA yaaAp5 GTGCGCCCGGTGTTTGATCCATTGCGTTATCCGGATATCGGTCGCC... ... 172 236 175 322 191 303 207 116 411 329
3 11542 yaaW 11542 11356 11356 10643 yaaW yaaW yaaWp3 GCCTGAATATTCCTTCAGAAATAAAAGAAGGGCAAACCACTGACTG... ... 504 776 12 36 567 1056 12 24 1505 1546
4 11913 yaaI 11913 11786 11786 11382 yaaI yaaI yaaIp3 ATCGCAGCCAAGGCATTCATCAAAAAATTGTAATAAAAAGAAAAGA... ... 304 356 14 23 389 399 8 13 458 440
5 11938 yaaI 11938 11786 11786 11382 yaaI yaaI yaaIp2 GCAATTTTATTCATATAAAGAATGAATCGCAGCCAAGGCATTCATC... ... 340 419 14 24 438 510 8 15 565 528
6 12048 dnaK-tpke11-dnaJ 12163 12048 14079 12163 dnaK dnaK-tpke11-dnaJ dnaKp1 AACCGCAGTGAGTGAGTCTGCAAAAAAATGAAATTGGGCAGTTGAA... ... 2977 6584 4799 2830 6350 6968 2336 6099 15976 14174
7 17317 nhaAR 17489 17317 18655 17489 nhaA nhaAR nhaAp2 GGTCACTCGTGAGCGCTTACAGCCGTCAAAAACGCATCTCACCGCT... ... 792 1358 133 215 1301 1908 74 137 2949 2223
8 21210 rpsT 21210 21078 21078 20815 rpsT rpsT rpsTp1 GCCATCACTACGTAACGAGTGCCGGCACATTAACGGCGCTTATTTG... ... 9302 8966 14726 24465 10912 9380 23735 17554 7045 5706
9 21833 ileS-lspA-fkpB-ispH 22391 21833 25207 22391 ileS ribF-ileS-lspA-fkpB-ispH ileSp1 GCTGGCATGGAATACGGCTTCGATATCACCAGTACGCAAACTTTTT... ... 1739 2728 5454 5556 2509 3155 4785 5718 6382 5578
10 22034 ileS-lspA-fkpB-ispH 22391 22034 25207 22391 ileS ribF-ileS-lspA-fkpB-ispH ileSp2 GCGAATGTACCGCTGCGCCGTCAGGTTTCCCCGGTGAAAGGGGTTT... ... 1369 2398 4524 4748 2043 2715 4188 5051 5825 5171
11 22229 ileS-lspA-fkpB-ispH 22391 22229 25207 22391 ileS ribF-ileS-lspA-fkpB-ispH ileSp3 GTGCTGCGTAAAAAAATACGCAATGAGCAGCGATTTGCGTCGCTGG... ... 1043 2079 3899 4030 1715 2298 3665 4388 5149 4749
12 25014 lspA-fkpB-ispH 25207 25014 25701 25207 lspA ribF-ileS-lspA-fkpB-ispH lspAp ACGCACCTGCTGATGCTCAGCAGAGCGAAGTACTCAAAGGGCTGAA... ... 1189 1295 1949 2738 1182 1692 2208 3284 2990 2404
13 28288 dapB 28374 28288 29195 28374 dapB dapB dapBp2 TAATTATCAGCGTTTTTGGCTGGCGGCGTAGCGATGCGCTGGTTAC... ... 75 103 122 257 142 138 133 131 194 221
14 29551 carAB 29651 29551 30799 29651 carA carAB carAp1 GTTTGCCAGAAATTCGTCGGTAAGCAGATTTGCATTGATTTACGTC... ... 72 116 61 63 124 141 130 61 206 212
15 34218 caiF 34300 34218 34695 34300 caiF caiF caiFp ATCCACAATTTTAATATGGCCTTGTTTAATTGCTTCAAAACGAGTC... ... 39 119 10 17 102 174 15 14 441 338
16 35499 caiE 35499 35371 35371 34781 caiE caiTABCDE caiEp3 GCGTATCGCTATATTCGCAGCGGCGTGTTGAAACACTATCCATCGG... ... 204 260 5 19 202 426 17 19 908 833
17 42037 caiTABCDE 42037 41931 41931 40417 caiT caiTABCDE caiTp GCCATTAACGCGTCCACGAGGTTAATAATAATTATATTAAATGTTA... ... 22 63 1 3 33 88 1 2 105 144
18 45592 yaaU 45807 45592 47138 45807 yaaU yaaU yaaUp CCCGCAGGTCTGTACAAGAAGCAGGATGACGGCAGTGTGCGCTTCG... ... 320 371 5 29 298 863 2 19 1138 1221
19 47080 kefFC 47246 47080 47776 47246 kefF kefFC kefFp2 GAGGGATGTCACTGGCGCAGACCAGCAATATGACGATCCGCGGGCA... ... 114 188 36 101 102 318 40 47 481 420
20 52034 apaGH 52034 51606 51606 51229 apaG lptD-surA-pdxA-rsmA-apaGH apaGp ATTGCCGACATGCACTTTATGTTGCAAAAAGAGGTGGTGAATCGTC... ... 3292 3141 2270 3686 3032 3883 3699 3152 4924 4449
21 52588 rsmA-apaGH 52588 52430 52430 51609 rsmA lptD-surA-pdxA-rsmA-apaGH rsmAp GGCTTCGGGCGCGGTGTGAACATTACGCTGGGCCTGCCCTTTATTC... ... 1129 1430 905 2211 982 1794 1888 1506 2132 1719
22 57241 lptD 57241 57109 57109 54755 lptD lptD-surA-pdxA-rsmA-apaGH impp2 ATTCAATGCCGTGCCTAACACCACTGACGCTATTCGGACAGGATTA... ... 996 1071 2016 2770 1035 1578 2130 1373 1517 1384
23 57336 lptD-surA-pdxA 57336 57109 57109 54755 lptD lptD-surA-pdxA-rsmA-apaGH impp3 CAGGGCTATCTCCCACAATATAAAGGTGCTTTTACCGTTTTCCGGC... ... 1047 1152 2130 2882 1135 1653 2279 1468 1641 1495
24 57268 djlA 57364 57268 58179 57364 djlA djlA djlAp3 GCGATGTCCCACAATTGACCGCAGCCGGAAAACGGTAAAAGCACCT... ... 103 146 134 139 159 183 205 134 231 247
25 60450 rluA 60450 60346 60346 59687 rluA hepA-rluA rluAp1 ATTGAGAGCAACCGTCAGCAGGTAATGGAAAGCCTGGATCAGGCAG... ... 884 747 544 623 882 816 1444 709 1012 1013
26 70241 araC 70387 70241 71265 70387 araC araC araCp TGGTCCCGCTTTGTTACAGAATGCTTTTAATAAGCGGGGTTACCGG... ... 177 295 252 187 202 353 53 114 614 753
27 73085 thiQ 73085 72927 72927 72229 thiQ sgrR-sroA-tbpA-thiPQ thiQp2 GGTGGCGTTGTTCGGTAACGATGATTTCCGCACCCTGCCGTTTTAT... ... 349 463 114 234 357 547 173 198 698 728
28 79594 leuD 79594 79453 79453 78848 leuD leuLABCD leuDp4 GTGCCTCCACCAGCAACCGTAACTTTGAAGGCCGCCAGGGGCGCGG... ... 71 88 31 67 49 125 62 93 221 181
29 84024 leuLABCD 84024 83708 83708 83622 leuL leuLABCD leuLp2 AAATCAATATCAAAAAAAATCGCAAAACATATAATTCAATACAAAT... ... 94 222 256 294 180 268 79 155 420 334
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1657 4589390 yjiY 4589390 4589302 4589302 4587152 yjiY yjiY yjiYp AGGATAGCGGTCAATTTACCTCCTCAAACGCAACGCAAACCTAGAA... ... 1240 5424 2950 7589 1527 8506 324 18591 3646 12952
1658 4589435 tsr 4589680 4589435 4591335 4589680 tsr tsr tsrp1 AACTGAGTGGTTATTTTAGGGATGTAAGCGGTCAGTTTTGCGGTTG... ... 1981 6099 4244 7442 2310 7596 380 19107 3708 13198
1659 4592860 yjjL 4592860 4592745 4592745 4591384 yjjL yjjL yjjLp7 AAAAATGTTTTTGTGAGCGGTAGTAAAGTCCTAAAACTTTAACCTG... ... 2960 2760 34 143 2371 4532 11 69 6694 6507
1660 4592867 yjjL 4592867 4592745 4592745 4591384 yjjL yjjL yjjLp6 GTCGACTAAAAATGTTTTTGTGAGCGGTAGTAAAGTCCTAAAACTT... ... 3095 2914 34 147 2542 4692 11 72 7034 6783
1661 4592905 yjjL 4592905 4592745 4592745 4591384 yjjL yjjL yjjLp8 AATACCGGGAAATTCCCGCTTACCTATGCTCACAATCAGTCGACTA... ... 4063 3850 40 173 3666 5851 12 85 8908 8482
1662 4593970 yjjM 4593970 4593874 4593874 4592960 yjjM yjjM yjjMp4 ATGAAAAATGGCAGCGGAACGGAAAATCTTTTTTGTGAAAACACAC... ... 2083 1977 32 105 2088 3125 46 117 7863 10366
1663 4598360 yjjA 4598360 4598212 4598212 4597718 yjjA yjjB-dnaTC-yjjA yjjAp1 GGGCGAACGCGTGATGGACCGTATGCGCCTGGGTAACAGTTTGTGG... ... 1336 1759 292 532 1931 2841 436 434 1776 1739
1664 4599638 dnaTC-yjjA 4599638 4599540 4599540 4599001 dnaT yjjB-dnaTC-yjjA dnaTp CGCTTAAGCACACGGATGAGAGACAGCCTCCTCTCCTCCGTGTGTT... ... 2809 2471 35 66 2550 3307 30 40 2253 2253
1665 4599638 dnaTC 4599638 4599540 4599540 4599001 dnaT yjjB-dnaTC-yjjA dnaTp CGCTTAAGCACACGGATGAGAGACAGCCTCCTCTCCTCCGTGTGTT... ... 2809 2471 35 66 2550 3307 30 40 2253 2253
1666 4600200 yjjB-dnaTC-yjjA 4600200 4599973 4599973 4599647 yjjB yjjB-dnaTC-yjjA yjjBp CGCTGGGCGATCGCCAGTCTGCTGACACTGGCTACCTGCGTCGGCG... ... 3408 2771 301 393 3686 4104 332 321 2541 2726
1667 4600200 yjjB-dnaTC 4600200 4599973 4599973 4599647 yjjB yjjB-dnaTC-yjjA yjjBp CGCTGGGCGATCGCCAGTCTGCTGACACTGGCTACCTGCGTCGGCG... ... 3408 2771 301 393 3686 4104 332 321 2541 2726
1668 4601057 yjjP 4601057 4600881 4600881 4600111 yjjP yjjP yjjPp GATAGTTTGTTTGCGGCGAGAGATAATTCGCTTTTTATCACCGAGC... ... 4422 4996 177 362 4721 6884 320 299 4190 6384
1669 4601342 yjjQ-bglJ 4601500 4601342 4602225 4601500 yjjQ yjjQ-bglJ yjjQp ACATGCAGTGGAGTTGTTGTGCAGCAGGAGTATGCTGATATGAAAG... ... 5614 6083 240 529 5453 9098 349 503 5119 5931
1670 4603775 fhuF 4603775 4603686 4603686 4602898 fhuF fhuF fhuFp2 ATCATTTGCAAGCCAGATAAATCCCTTGCTATCGGGTAAACCTATC... ... 3901 5340 37 95 6301 7406 12 71 5391 6479
1671 4604296 leuV 4604296 4604188 4604188 4604102 leuV leuQPV leuVp2 AATTGGTAGACGCGCTAGCTTCAGGTGTTAGTGTTCTTACGGACGT... ... 8529 8317 24 145 7494 11600 23 123 9619 13085
1672 4605891 rsmC 4605891 4605723 4605723 4604692 rsmC rsmC rsmCp1 AGGGCGACGCAGCGACCACTGGGTAATGCCCAGTTGCTGTAACTGC... ... 23971 25280 131 444 28109 37675 120 298 31832 30399
1673 4609176 osmY 4609419 4609176 4610024 4609419 osmY osmY osmYp GTGATGACATTTCTGACGGCGTTAAATACCGTTCAATGCGTAGATA... ... 1131 1066 711 2113 957 1281 1854 1150 1343 1131
1674 4614702 deoCABD 4615346 4614702 4616125 4615346 deoC deoCABD deoCp1 ATACGGTTGCAACAACGCATCCAGTTGCCCCAGGTAGACCGGCATC... ... 749 744 84 108 653 1067 33 62 3473 2084
1675 4619680 yjjJ 4619792 4619680 4621123 4619792 yjjJ yjjJ yjjJp2 GGCTTTTTAGTATCTATTCATTTTTCTCTCCAGCTTGAATATTTTC... ... 1477 1516 3697 5754 1213 2070 2906 5983 2706 3402
1676 4622261 lplA 4622261 4622140 4622140 4621124 lplA ytjB-lplA lplAp2 ACAGGGTAAACGCACCCGCTGGCAGCAATCGCCCTTCCTGTTAACC... ... 304 318 255 159 381 409 109 73 695 685
1677 4631922 yjjX 4631922 4631768 4631768 4631256 yjjX yjjX yjjXp6 GTTGCTCACCTTTGGCGGTCAGCGGGCTGTCAGACTGGCCCTGAAT... ... 472 440 125 85 393 691 55 102 1072 1030
1678 4633266 creABC 4633544 4633266 4634017 4633544 creA creABCD creAp2 GACAGGGGCTGATCCAGATGACCTTCCAGCCAGATTAAAAGGTCGC... ... 1276 1070 344 421 940 1328 277 297 1849 1855
1679 4638531 arcA 4638531 4638329 4638329 4637613 arcA arcA arcAp1 GTTTTTGACACTGTCGGGTCCTGAGGGAAAGTACCCACGACCAAGC... ... 334 666 46 40 1263 620 14 22 1038 955
1680 4638535 arcA 4638535 4638329 4638329 4637613 arcA arcA arcAp2 AGCCGTTTTTGACACTGTCGGGTCCTGAGGGAAAGTACCCACGACC... ... 335 674 46 40 1270 630 14 22 1046 966
1681 4638558 arcA 4638558 4638329 4638329 4637613 arcA arcA arcAp3 GCTGTTAAAATGGTTAGGATGACAGCCGTTTTTGACACTGTCGGGT... ... 347 693 46 41 1284 664 14 23 1082 999
1682 4638622 arcA 4638622 4638329 4638329 4637613 arcA arcA arcAp4 ACTTGATATATGTCAACGAAGCGTAGTTTTATTGGGTGTCCGGCCC... ... 376 741 48 41 1337 717 14 26 1186 1078
1683 4638704 arcA 4638704 4638329 4638329 4637613 arcA arcA arcAp5 TAGTTGGATTATTAAAATAATGTGACGAAAGCTAGCATTTAGATAC... ... 420 797 54 45 1385 780 15 29 1310 1216
1684 4638711 arcA 4638711 4638329 4638329 4637613 arcA arcA arcAp6 ATGCAACTAGTTGGATTATTAAAATAATGTGACGAAAGCTAGCATT... ... 425 800 54 45 1387 780 15 29 1318 1225
1685 4638824 arcA 4638824 4638329 4638329 4637613 arcA arcA arcAp7 CTGTACTAACGGTTGAGTTGTTAAAAAATGCTACATATCCTTCTGT... ... 457 843 56 48 1420 835 15 32 1408 1333
1686 4638861 yjtD 4638965 4638861 4639651 4638965 yjtD yjtD yjtDp8 GGGCTTTTTCTGCGACTTACGTTAAGAATTTGTAAATTCGCACCGC... ... 185 232 23 25 272 268 10 16 362 297

1687 rows × 35 columns

Normalize counts for feature length, log-transform, and take means for replicates

Pseudo-counts (+1) are added during UTR reads counting to make sure we can log-transform the data.


In [5]:
id_vars = ['TSS','TU_name','coord_5','coord_3','gene', 'UTR_length']
value_vars = ['s9','s17','s19','s9+bcm','s17+bcm','s19+bcm']

dfn = dfm.copy()

# Normalize counts by gene and utr length
def norm_orf(barcode, rec):
    return float(rec[barcode] / abs(rec['first_gene_5'] - rec['first_gene_3']))

def norm_utr(barcode, rec):
    return float(rec['utr_{0}'.format(barcode)] / rec['UTR_length'])

for barcode in barcodes:
    dfn['orf_{0}'.format(barcode)] = dfn.apply(lambda rec: norm_orf(barcode, rec), axis=1)
    dfn['utr_{0}'.format(barcode)] = dfn.apply(lambda rec: norm_utr(barcode, rec), axis=1)

    
df = dfn[id_vars].copy()
# Take means across replicates according to the samples dict
for sample, bcs in samples.items():
    df['orf_{0}'.format(sample)] = np.log10(dfn[['orf_{0}'.format(b) for b in list(bcs)]].mean(axis=1))
    df['utr_{0}'.format(sample)] = np.log10(dfn[['utr_{0}'.format(b) for b in list(bcs)]].mean(axis=1))
df


Out[5]:
TSS TU_name coord_5 coord_3 gene UTR_length orf_s9 utr_s9 orf_s19+bcm utr_s19+bcm orf_s9+bcm utr_s9+bcm orf_s17+bcm utr_s17+bcm orf_s19 utr_s19 orf_s17 utr_s17
0 5030 yaaX 5030 5234 yaaX 204 -0.040733 0.278642 0.415199 0.357356 0.224628 0.247275 0.237554 0.336283 -0.123962 -0.040117 -0.068171 0.262079
1 6587 yaaA 6459 6587 yaaA 128 0.040578 0.236199 0.232518 0.432493 -0.027730 0.180592 0.048408 0.256402 -0.005917 0.096910 0.068941 0.281956
2 6615 yaaA 6459 6615 yaaA 156 0.040578 0.161944 0.232518 0.375077 -0.027730 0.116506 0.048408 0.199572 -0.005917 0.015048 0.068941 0.202202
3 11542 yaaW 11356 11542 yaaW 186 -1.158484 -1.052029 0.825338 0.913899 0.384203 0.536667 0.450862 0.639776 -1.074938 -1.014240 -1.112727 -0.889302
4 11913 yaaI 11786 11913 yaaI 127 -1.101231 -0.913472 0.611366 0.548443 0.270702 0.414710 0.369510 0.491693 -1.226170 -1.082614 -0.993598 -0.836632
5 11938 yaaI 11786 11938 yaaI 152 -1.101231 -0.926571 0.611366 0.555747 0.270702 0.397368 0.369510 0.493935 -1.226170 -1.121146 -0.993598 -0.903090
6 12048 dnaK-tpke11-dnaJ 12048 12163 dnaK 115 1.370097 1.332965 1.966770 2.117559 1.539609 1.618775 1.639229 1.762711 1.643435 1.564357 1.360430 1.520740
7 17317 nhaAR 17317 17489 nhaA 172 0.068752 -0.024675 0.958300 1.177100 0.466310 0.795880 0.557119 0.969811 -0.051339 -0.212276 -0.026888 0.005021
8 21210 rpsT 21078 21210 rpsT 132 2.351260 2.320020 1.655317 1.683940 1.922250 1.840087 1.954628 1.885721 2.264445 2.194230 2.231317 2.171582
9 21833 ileS-lspA-fkpB-ispH 21833 22391 ileS 558 0.818969 0.927446 0.933123 1.030067 0.554431 0.602352 0.598032 0.705459 0.949946 0.973649 0.793145 0.994123
10 22034 ileS-lspA-fkpB-ispH 22034 22391 ileS 357 0.818969 1.045520 0.933123 1.187537 0.554431 0.722297 0.598032 0.823726 0.949946 1.111927 0.793145 1.113475
11 22229 ileS-lspA-fkpB-ispH 22229 22391 ileS 162 0.818969 1.318179 0.933123 1.485002 0.554431 0.983888 0.598032 1.092924 0.949946 1.395413 0.793145 1.388673
12 25014 lspA-fkpB-ispH 25014 25207 lspA 193 0.812101 1.125894 0.775358 1.145324 0.489258 0.808564 0.520719 0.871899 0.882614 1.153143 0.703779 1.084308
13 28288 dapB 28288 28374 dapB 86 0.301427 0.226870 0.370426 0.382520 0.025426 0.014892 0.158091 0.211630 0.413118 0.186075 0.468394 0.343111
14 29551 carAB 29551 29651 carA 100 -0.269305 -0.055517 0.217094 0.320146 -0.047738 -0.026872 0.006570 0.122216 -0.120173 -0.019997 -0.376895 -0.207608
15 34218 caiF 34218 34300 caiF 82 -0.647207 -0.474481 0.753263 0.676694 0.069921 -0.016187 0.274684 0.226065 -0.754612 -0.752446 -0.685439 -0.783480
16 35499 caiE 35371 35499 caiE 128 -0.875982 -1.028029 0.465559 0.832559 -0.169335 0.258278 -0.054431 0.389720 -0.948030 -0.851937 -0.925754 -1.028029
17 42037 caiTABCDE 41931 42037 caiT 106 -1.773586 -1.849215 0.077073 0.069863 -0.375986 -0.396917 -0.180126 -0.243550 -1.732968 -1.849215 -1.655081 -1.724276
18 45592 yaaU 45592 45807 yaaU 215 -1.275989 -1.235528 0.372890 0.739259 -0.173083 0.206010 0.008722 0.431364 -1.425208 -1.311249 -1.240517 -1.101990
19 47080 kefFC 47080 47246 kefF 166 -0.520156 -0.464233 0.424789 0.433587 -0.015855 -0.041131 0.153383 0.102111 -0.592337 -0.581619 -0.438719 -0.384418
20 52034 apaGH 51606 52034 apaG 428 0.687531 0.888712 0.845591 1.039405 0.603785 0.875940 0.665081 0.907318 0.682296 0.903280 0.632905 0.842481
21 52588 rsmA-apaGH 52430 52588 rsmA 158 0.721794 1.091823 0.854295 1.085886 0.680933 0.908383 0.711535 0.943732 0.754369 1.031025 0.647533 0.993910
22 57241 lptD 57109 57241 lptD 132 0.741502 1.230739 0.619486 1.040944 0.439668 0.893737 0.495573 0.995535 0.758222 1.122836 0.769439 1.258369
23 57336 lptD-surA-pdxA 57109 57336 lptD 227 0.741502 1.019180 0.619486 0.839320 0.439668 0.685169 0.495573 0.788237 0.758222 0.916628 0.769439 1.042955
24 57268 djlA 57268 57364 djlA 96 0.010789 0.200999 0.388350 0.396127 0.114967 0.112898 0.147837 0.250725 0.090360 0.246898 -0.009699 0.152861
25 60450 rluA 60346 60450 rluA 104 0.466672 0.894657 0.596088 0.988362 0.542086 0.894391 0.554211 0.911874 0.611512 1.014981 0.423407 0.749008
26 70241 araC 70241 70387 araC 146 -0.668183 -0.375478 0.263466 0.670386 -0.069593 0.208559 0.074790 0.278910 -0.524365 -0.242666 -0.055596 0.177082
27 73085 thiQ 72927 73085 thiQ 158 -0.473713 0.113097 0.261314 0.654432 -0.050065 0.409869 0.038100 0.456481 -0.487830 0.069687 -0.499463 0.041892
28 79594 leuD 79453 79594 leuD 141 -0.572240 -0.378367 -0.017206 0.153977 -0.362626 -0.248852 -0.311198 -0.209700 -0.546227 -0.259917 -0.676245 -0.459023
29 84024 leuLABCD 83708 84024 leuL 316 0.425337 -0.275672 0.438414 0.076654 0.150078 -0.301030 0.247345 -0.149439 0.354421 -0.431501 0.684595 -0.060354
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1657 4589390 yjiY 4589302 4589390 yjiY 88 1.239090 1.704170 1.505317 1.974543 1.181578 1.578222 1.325749 1.755918 1.617327 2.031294 1.392771 1.777287
1658 4589435 tsr 4589435 4589680 tsr 245 -0.675973 1.257140 1.408266 1.537845 0.992936 1.217215 1.106440 1.305702 -0.757149 1.599549 -0.400572 1.377470
1659 4592860 yjjL 4592745 4592860 yjjL 115 -0.542237 -0.529219 1.732302 1.758879 1.075871 1.395668 1.164711 1.477310 -0.528015 -0.458638 -0.393890 -0.113755
1660 4592867 yjjL 4592745 4592867 yjjL 122 -0.542237 -0.542292 1.732302 1.753024 1.075871 1.391412 1.164711 1.471989 -0.528015 -0.468312 -0.393890 -0.129711
1661 4592905 yjjL 4592745 4592905 yjjL 160 -0.542237 -0.546109 1.732302 1.735150 1.075871 1.393191 1.164711 1.473350 -0.528015 -0.518378 -0.393890 -0.176770
1662 4593970 yjjM 4593874 4593970 yjjM 96 -0.697310 -0.090177 1.489834 1.977462 1.152512 1.325225 1.316606 1.433786 -0.617538 -0.071114 -0.297245 -0.146581
1663 4598360 yjjA 4598212 4598360 yjjA 148 0.320163 0.413504 1.117177 1.074634 1.190927 1.019369 1.255053 1.207409 0.268406 0.468228 0.272884 0.444636
1664 4599638 dnaTC-yjjA 4599540 4599638 dnaT 98 0.185128 -0.423024 1.254556 1.361535 1.252353 1.430378 1.341084 1.475419 0.096749 -0.447158 0.078980 -0.287935
1665 4599638 dnaTC 4599540 4599638 dnaT 98 0.185128 -0.423024 1.254556 1.361535 1.252353 1.430378 1.341084 1.475419 0.096749 -0.447158 0.078980 -0.287935
1666 4600200 yjjB-dnaTC-yjjA 4599973 4600200 yjjB 227 -0.818612 0.228870 1.151988 1.064507 1.235552 1.133862 1.293640 1.234482 -0.749790 0.157857 -0.615591 0.184304
1667 4600200 yjjB-dnaTC 4599973 4600200 yjjB 227 -0.818612 0.228870 1.151988 1.064507 1.235552 1.133862 1.293640 1.234482 -0.749790 0.157857 -0.615591 0.184304
1668 4601057 yjjP 4600881 4601057 yjjP 176 -0.534308 0.244446 1.491480 1.477697 1.542712 1.427416 1.677174 1.518102 -0.503574 0.245148 -0.339948 0.185046
1669 4601342 yjjQ-bglJ 4601342 4601500 yjjQ 158 -0.925840 0.526028 1.295772 1.543675 1.192664 1.568387 1.335313 1.663206 -0.915855 0.430753 -0.714210 0.386239
1670 4603775 fhuF 4603686 4603775 fhuF 89 0.459404 -0.363929 1.698181 1.824031 1.653873 1.715299 1.749789 1.886522 0.606583 -0.331342 0.487199 -0.129846
1671 4604296 leuV 4604188 4604296 leuV 108 0.798297 -0.277549 1.168963 2.021649 1.233557 1.892043 1.022869 1.946443 0.869982 -0.170101 0.650398 -0.106567
1672 4605891 rsmC 4605723 4605891 rsmC 168 0.498959 0.206054 0.194645 2.267668 0.066826 2.166076 0.061558 2.291781 0.439448 0.094837 0.342958 0.233329
1673 4609176 osmY 4609176 4609419 osmY 243 -0.298882 0.840607 1.111562 0.706763 0.853829 0.655194 0.940426 0.663224 -0.366782 0.791064 -0.292500 0.764228
1674 4614702 deoCABD 4614702 4615346 deoC 644 0.542952 -0.919584 0.928006 0.634925 0.475818 0.064144 0.719282 0.125613 0.622080 -1.132192 1.070690 -0.826615
1675 4619680 yjjJ 4619680 4619792 yjjJ 112 -0.256711 1.588822 0.431037 1.435651 0.108564 1.125859 0.191477 1.166023 -0.165137 1.598605 0.001791 1.625230
1676 4622261 lplA 4622140 4622261 lplA 121 -0.410297 -0.135842 0.361858 0.756064 0.025120 0.409975 0.042130 0.513812 -0.143273 -0.123744 -0.323847 0.233185
1677 4631922 yjjX 4631768 4631922 yjjX 154 0.274582 -0.391641 0.779492 0.834082 0.528148 0.471444 0.552469 0.546479 0.187256 -0.292651 0.281956 -0.166331
1678 4633266 creABC 4633266 4633544 creA 278 -0.010689 0.108015 0.628119 0.823596 0.221665 0.625253 0.290576 0.610568 0.127570 0.013837 0.104735 0.138587
1679 4638531 arcA 4638329 4638531 arcA 202 1.193160 -1.087867 1.401348 0.693126 0.825740 0.393619 0.841006 0.668469 1.216159 -1.050079 1.167144 -0.671883
1680 4638535 arcA 4638329 4638535 arcA 206 1.193160 -1.096383 1.401348 0.688731 0.825740 0.388994 0.841006 0.663856 1.216159 -1.058595 1.167144 -0.680399
1681 4638558 arcA 4638329 4638558 arcA 229 1.193160 -1.104563 1.401348 0.657407 0.825740 0.356168 0.841006 0.628723 1.216159 -1.092664 1.167144 -0.721346
1682 4638622 arcA 4638329 4638622 arcA 293 1.193160 -1.188114 1.401348 0.586979 0.825740 0.280156 0.841006 0.544703 1.216159 -1.165838 1.167144 -0.818508
1683 4638704 arcA 4638329 4638704 arcA 375 1.193160 -1.262277 1.401348 0.527372 0.825740 0.210229 0.841006 0.460397 1.216159 -1.231609 1.167144 -0.879426
1684 4638711 arcA 4638329 4638711 arcA 382 1.193160 -1.259844 1.401348 0.522253 0.825740 0.205043 0.841006 0.452766 1.216159 -1.239641 1.167144 -0.887458
1685 4638824 arcA 4638329 4638824 arcA 495 1.193160 -1.323537 1.401348 0.442274 0.825740 0.118308 0.841006 0.357511 1.216159 -1.323537 1.167144 -0.978602
1686 4638861 yjtD 4638861 4638965 yjtD 104 -0.532049 -0.799549 -0.009925 0.500822 -0.479343 0.302073 -0.367239 0.414330 -0.272843 -0.903090 -0.499864 -0.636822

1687 rows × 18 columns

Plot wild type with vs without BCM

Two clusters are apparent. We are after the UTRs that are upregulated by the addition of BCM (cloud of points in the left part of the plot along y=0 line and in general (significantly) above y=x line).

BTW, the point size is the length of UTR. No (apparent) correlation here.


In [6]:
p = ggplot(df, aes(x='utr_s9', y='utr_s9+bcm', size='UTR_length')) \
        + geom_point(alpha=0.1) \
        + geom_abline(slope=1, intercept=0, size=2.5, color='#586e75')
print(p)


<ggplot: (8763937444749)>

In [8]:
p = ggplot(df, aes(x='utr_s9', y='utr_s19', size='UTR_length')) \
        + geom_point(alpha=0.1) \
        + geom_abline(slope=1, intercept=0, size=2.5, color='#586e75')
print(p)


<ggplot: (-9223363272917432888)>

Clustering

Now we need a way to split the points the way we want. Let's try a bunch of clustering algorithms from scikit-learn.


In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import euclidean_distances
from sklearn.neighbors import kneighbors_graph
from sklearn import cluster
from sklearn import mixture

X = df.as_matrix(columns=['utr_s9', 'utr_s9+bcm'])
X = StandardScaler().fit_transform(X)

bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
connectivity = kneighbors_graph(X, n_neighbors=20)
connectivity = 0.05 * (connectivity + connectivity.T)
#distances = euclidean_distances(X)

gmm = mixture.GMM(n_components=2, covariance_type='full')

ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=2, batch_size=200)
kmeans = cluster.KMeans(n_clusters=2)
ward = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward', connectivity=connectivity)
spectral = cluster.SpectralClustering(n_clusters=2, n_neighbors=20, eigen_solver='arpack', affinity='nearest_neighbors')
dbscan = cluster.DBSCAN(eps=.5)
affinity_propagation = cluster.AffinityPropagation(damping=.95, preference=-200)
average_linkage = cluster.AgglomerativeClustering(linkage='average', affinity='cityblock', n_clusters=2, connectivity=connectivity)

for name, alg in [
                    ('MiniBatchKMeans', two_means),
                    ('KMeans', kmeans),
                    ('AffinityPropagation', affinity_propagation),
                    ('MeanShift', ms),
                    ('GMM', gmm),
                    ('SpectralClustering', spectral),
                    ('Ward', ward),
                    ('AgglomerativeClustering', average_linkage),
                    ('DBSCAN', dbscan)
                ]:
    alg.fit(X)
    if hasattr(alg, 'labels_'):
        df['label'] = alg.labels_.astype(np.int)
    else:
        df['label'] = alg.predict(X)
    
    p = ggplot(df, aes(x='utr_s9', y='utr_s9+bcm', color='label')) \
        + geom_point(alpha=0.5) \
        + ggtitle(name) \
        + geom_abline(slope=1, intercept=0, size=2.5, color='#586e75')
    print(p)


/home/ilya/.venv/pydata3/lib/python3.4/site-packages/sklearn/neighbors/graph.py:37: DeprecationWarning: The behavior of 'kneighbors_graph' when mode='connectivity' will change in version 0.18. Presently, the nearest neighbor of each sample is the sample itself. Beginning in version 0.18, the default behavior will be to exclude each sample from being its own nearest neighbor. To maintain the current behavior, set include_self=True.
  "behavior, set include_self=True.", DeprecationWarning)
<ggplot: (8763937443690)>
<ggplot: (8763931127223)>
<ggplot: (8763924495480)>
<ggplot: (8763937442119)>
<ggplot: (-9223363272930384440)>
<ggplot: (-9223363272930297831)>
<ggplot: (-9223363272930371019)>
<ggplot: (8763924531895)>
<ggplot: (8763924256765)>

In [ ]:
X = df.as_matrix