In [1]:
%matplotlib inline
In [2]:
import pandas as pd
import numpy as np
import os
from ggplot import *
In [3]:
# Sample titles with corresponding barcodes
samples = {
's9': ['ATCACG', 'ACAGTG'],
's9+bcm': ['CGATGT', 'GCCAAT'],
's17': ['TTAGGC', 'GATCAG'],
's17+bcm': ['TGACCA', 'TAGCTT'],
's19': ['CAGATC','GGCTAC'],
's19+bcm': ['ACTTGA', 'CTTGTA']
}
# Barcodes
barcodes = ['ATCACG', 'ACAGTG', 'CGATGT', 'GCCAAT', 'TTAGGC', 'GATCAG', 'TGACCA', 'TAGCTT', 'CAGATC','GGCTAC', 'ACTTGA', 'CTTGTA']
Gene counts were obtained using htseq
program against the standard NC_000913 .GFF file The was I calculate reads in UTRs here is not strand-specific. So the numbers can be confounded if there is a transcript going in the opposite direction. We can solve this later if needed.
In [4]:
dfm = pd.DataFrame.from_csv('../data/dfm.csv', sep='\t')
dfm
Out[4]:
TSS
TU_name
coord_3
coord_5
first_gene_3
first_gene_5
gene
operon
promoter
seq_5UTR
...
utr_CGATGT
utr_GCCAAT
utr_TTAGGC
utr_GATCAG
utr_TGACCA
utr_TAGCTT
utr_CAGATC
utr_GGCTAC
utr_ACTTGA
utr_CTTGTA
0
5030
yaaX
5234
5030
5530
5234
yaaX
yaaX
yaaXp4
ATTATCTCAATCAGGCCGGGTTTGCTTTTATGCAGCCCGGCTTTTT...
...
351
370
298
448
439
446
137
235
479
450
1
6587
yaaA
6587
6459
6459
5683
yaaA
yaaA
yaaAp3
ATCCGGATATCGGTCGCCAGCTTTCTCCGGACGCGTGGGATGATGT...
...
163
225
175
315
181
281
206
114
388
305
2
6615
yaaA
6615
6459
6459
5683
yaaA
yaaA
yaaAp5
GTGCGCCCGGTGTTTGATCCATTGCGTTATCCGGATATCGGTCGCC...
...
172
236
175
322
191
303
207
116
411
329
3
11542
yaaW
11542
11356
11356
10643
yaaW
yaaW
yaaWp3
GCCTGAATATTCCTTCAGAAATAAAAGAAGGGCAAACCACTGACTG...
...
504
776
12
36
567
1056
12
24
1505
1546
4
11913
yaaI
11913
11786
11786
11382
yaaI
yaaI
yaaIp3
ATCGCAGCCAAGGCATTCATCAAAAAATTGTAATAAAAAGAAAAGA...
...
304
356
14
23
389
399
8
13
458
440
5
11938
yaaI
11938
11786
11786
11382
yaaI
yaaI
yaaIp2
GCAATTTTATTCATATAAAGAATGAATCGCAGCCAAGGCATTCATC...
...
340
419
14
24
438
510
8
15
565
528
6
12048
dnaK-tpke11-dnaJ
12163
12048
14079
12163
dnaK
dnaK-tpke11-dnaJ
dnaKp1
AACCGCAGTGAGTGAGTCTGCAAAAAAATGAAATTGGGCAGTTGAA...
...
2977
6584
4799
2830
6350
6968
2336
6099
15976
14174
7
17317
nhaAR
17489
17317
18655
17489
nhaA
nhaAR
nhaAp2
GGTCACTCGTGAGCGCTTACAGCCGTCAAAAACGCATCTCACCGCT...
...
792
1358
133
215
1301
1908
74
137
2949
2223
8
21210
rpsT
21210
21078
21078
20815
rpsT
rpsT
rpsTp1
GCCATCACTACGTAACGAGTGCCGGCACATTAACGGCGCTTATTTG...
...
9302
8966
14726
24465
10912
9380
23735
17554
7045
5706
9
21833
ileS-lspA-fkpB-ispH
22391
21833
25207
22391
ileS
ribF-ileS-lspA-fkpB-ispH
ileSp1
GCTGGCATGGAATACGGCTTCGATATCACCAGTACGCAAACTTTTT...
...
1739
2728
5454
5556
2509
3155
4785
5718
6382
5578
10
22034
ileS-lspA-fkpB-ispH
22391
22034
25207
22391
ileS
ribF-ileS-lspA-fkpB-ispH
ileSp2
GCGAATGTACCGCTGCGCCGTCAGGTTTCCCCGGTGAAAGGGGTTT...
...
1369
2398
4524
4748
2043
2715
4188
5051
5825
5171
11
22229
ileS-lspA-fkpB-ispH
22391
22229
25207
22391
ileS
ribF-ileS-lspA-fkpB-ispH
ileSp3
GTGCTGCGTAAAAAAATACGCAATGAGCAGCGATTTGCGTCGCTGG...
...
1043
2079
3899
4030
1715
2298
3665
4388
5149
4749
12
25014
lspA-fkpB-ispH
25207
25014
25701
25207
lspA
ribF-ileS-lspA-fkpB-ispH
lspAp
ACGCACCTGCTGATGCTCAGCAGAGCGAAGTACTCAAAGGGCTGAA...
...
1189
1295
1949
2738
1182
1692
2208
3284
2990
2404
13
28288
dapB
28374
28288
29195
28374
dapB
dapB
dapBp2
TAATTATCAGCGTTTTTGGCTGGCGGCGTAGCGATGCGCTGGTTAC...
...
75
103
122
257
142
138
133
131
194
221
14
29551
carAB
29651
29551
30799
29651
carA
carAB
carAp1
GTTTGCCAGAAATTCGTCGGTAAGCAGATTTGCATTGATTTACGTC...
...
72
116
61
63
124
141
130
61
206
212
15
34218
caiF
34300
34218
34695
34300
caiF
caiF
caiFp
ATCCACAATTTTAATATGGCCTTGTTTAATTGCTTCAAAACGAGTC...
...
39
119
10
17
102
174
15
14
441
338
16
35499
caiE
35499
35371
35371
34781
caiE
caiTABCDE
caiEp3
GCGTATCGCTATATTCGCAGCGGCGTGTTGAAACACTATCCATCGG...
...
204
260
5
19
202
426
17
19
908
833
17
42037
caiTABCDE
42037
41931
41931
40417
caiT
caiTABCDE
caiTp
GCCATTAACGCGTCCACGAGGTTAATAATAATTATATTAAATGTTA...
...
22
63
1
3
33
88
1
2
105
144
18
45592
yaaU
45807
45592
47138
45807
yaaU
yaaU
yaaUp
CCCGCAGGTCTGTACAAGAAGCAGGATGACGGCAGTGTGCGCTTCG...
...
320
371
5
29
298
863
2
19
1138
1221
19
47080
kefFC
47246
47080
47776
47246
kefF
kefFC
kefFp2
GAGGGATGTCACTGGCGCAGACCAGCAATATGACGATCCGCGGGCA...
...
114
188
36
101
102
318
40
47
481
420
20
52034
apaGH
52034
51606
51606
51229
apaG
lptD-surA-pdxA-rsmA-apaGH
apaGp
ATTGCCGACATGCACTTTATGTTGCAAAAAGAGGTGGTGAATCGTC...
...
3292
3141
2270
3686
3032
3883
3699
3152
4924
4449
21
52588
rsmA-apaGH
52588
52430
52430
51609
rsmA
lptD-surA-pdxA-rsmA-apaGH
rsmAp
GGCTTCGGGCGCGGTGTGAACATTACGCTGGGCCTGCCCTTTATTC...
...
1129
1430
905
2211
982
1794
1888
1506
2132
1719
22
57241
lptD
57241
57109
57109
54755
lptD
lptD-surA-pdxA-rsmA-apaGH
impp2
ATTCAATGCCGTGCCTAACACCACTGACGCTATTCGGACAGGATTA...
...
996
1071
2016
2770
1035
1578
2130
1373
1517
1384
23
57336
lptD-surA-pdxA
57336
57109
57109
54755
lptD
lptD-surA-pdxA-rsmA-apaGH
impp3
CAGGGCTATCTCCCACAATATAAAGGTGCTTTTACCGTTTTCCGGC...
...
1047
1152
2130
2882
1135
1653
2279
1468
1641
1495
24
57268
djlA
57364
57268
58179
57364
djlA
djlA
djlAp3
GCGATGTCCCACAATTGACCGCAGCCGGAAAACGGTAAAAGCACCT...
...
103
146
134
139
159
183
205
134
231
247
25
60450
rluA
60450
60346
60346
59687
rluA
hepA-rluA
rluAp1
ATTGAGAGCAACCGTCAGCAGGTAATGGAAAGCCTGGATCAGGCAG...
...
884
747
544
623
882
816
1444
709
1012
1013
26
70241
araC
70387
70241
71265
70387
araC
araC
araCp
TGGTCCCGCTTTGTTACAGAATGCTTTTAATAAGCGGGGTTACCGG...
...
177
295
252
187
202
353
53
114
614
753
27
73085
thiQ
73085
72927
72927
72229
thiQ
sgrR-sroA-tbpA-thiPQ
thiQp2
GGTGGCGTTGTTCGGTAACGATGATTTCCGCACCCTGCCGTTTTAT...
...
349
463
114
234
357
547
173
198
698
728
28
79594
leuD
79594
79453
79453
78848
leuD
leuLABCD
leuDp4
GTGCCTCCACCAGCAACCGTAACTTTGAAGGCCGCCAGGGGCGCGG...
...
71
88
31
67
49
125
62
93
221
181
29
84024
leuLABCD
84024
83708
83708
83622
leuL
leuLABCD
leuLp2
AAATCAATATCAAAAAAAATCGCAAAACATATAATTCAATACAAAT...
...
94
222
256
294
180
268
79
155
420
334
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
1657
4589390
yjiY
4589390
4589302
4589302
4587152
yjiY
yjiY
yjiYp
AGGATAGCGGTCAATTTACCTCCTCAAACGCAACGCAAACCTAGAA...
...
1240
5424
2950
7589
1527
8506
324
18591
3646
12952
1658
4589435
tsr
4589680
4589435
4591335
4589680
tsr
tsr
tsrp1
AACTGAGTGGTTATTTTAGGGATGTAAGCGGTCAGTTTTGCGGTTG...
...
1981
6099
4244
7442
2310
7596
380
19107
3708
13198
1659
4592860
yjjL
4592860
4592745
4592745
4591384
yjjL
yjjL
yjjLp7
AAAAATGTTTTTGTGAGCGGTAGTAAAGTCCTAAAACTTTAACCTG...
...
2960
2760
34
143
2371
4532
11
69
6694
6507
1660
4592867
yjjL
4592867
4592745
4592745
4591384
yjjL
yjjL
yjjLp6
GTCGACTAAAAATGTTTTTGTGAGCGGTAGTAAAGTCCTAAAACTT...
...
3095
2914
34
147
2542
4692
11
72
7034
6783
1661
4592905
yjjL
4592905
4592745
4592745
4591384
yjjL
yjjL
yjjLp8
AATACCGGGAAATTCCCGCTTACCTATGCTCACAATCAGTCGACTA...
...
4063
3850
40
173
3666
5851
12
85
8908
8482
1662
4593970
yjjM
4593970
4593874
4593874
4592960
yjjM
yjjM
yjjMp4
ATGAAAAATGGCAGCGGAACGGAAAATCTTTTTTGTGAAAACACAC...
...
2083
1977
32
105
2088
3125
46
117
7863
10366
1663
4598360
yjjA
4598360
4598212
4598212
4597718
yjjA
yjjB-dnaTC-yjjA
yjjAp1
GGGCGAACGCGTGATGGACCGTATGCGCCTGGGTAACAGTTTGTGG...
...
1336
1759
292
532
1931
2841
436
434
1776
1739
1664
4599638
dnaTC-yjjA
4599638
4599540
4599540
4599001
dnaT
yjjB-dnaTC-yjjA
dnaTp
CGCTTAAGCACACGGATGAGAGACAGCCTCCTCTCCTCCGTGTGTT...
...
2809
2471
35
66
2550
3307
30
40
2253
2253
1665
4599638
dnaTC
4599638
4599540
4599540
4599001
dnaT
yjjB-dnaTC-yjjA
dnaTp
CGCTTAAGCACACGGATGAGAGACAGCCTCCTCTCCTCCGTGTGTT...
...
2809
2471
35
66
2550
3307
30
40
2253
2253
1666
4600200
yjjB-dnaTC-yjjA
4600200
4599973
4599973
4599647
yjjB
yjjB-dnaTC-yjjA
yjjBp
CGCTGGGCGATCGCCAGTCTGCTGACACTGGCTACCTGCGTCGGCG...
...
3408
2771
301
393
3686
4104
332
321
2541
2726
1667
4600200
yjjB-dnaTC
4600200
4599973
4599973
4599647
yjjB
yjjB-dnaTC-yjjA
yjjBp
CGCTGGGCGATCGCCAGTCTGCTGACACTGGCTACCTGCGTCGGCG...
...
3408
2771
301
393
3686
4104
332
321
2541
2726
1668
4601057
yjjP
4601057
4600881
4600881
4600111
yjjP
yjjP
yjjPp
GATAGTTTGTTTGCGGCGAGAGATAATTCGCTTTTTATCACCGAGC...
...
4422
4996
177
362
4721
6884
320
299
4190
6384
1669
4601342
yjjQ-bglJ
4601500
4601342
4602225
4601500
yjjQ
yjjQ-bglJ
yjjQp
ACATGCAGTGGAGTTGTTGTGCAGCAGGAGTATGCTGATATGAAAG...
...
5614
6083
240
529
5453
9098
349
503
5119
5931
1670
4603775
fhuF
4603775
4603686
4603686
4602898
fhuF
fhuF
fhuFp2
ATCATTTGCAAGCCAGATAAATCCCTTGCTATCGGGTAAACCTATC...
...
3901
5340
37
95
6301
7406
12
71
5391
6479
1671
4604296
leuV
4604296
4604188
4604188
4604102
leuV
leuQPV
leuVp2
AATTGGTAGACGCGCTAGCTTCAGGTGTTAGTGTTCTTACGGACGT...
...
8529
8317
24
145
7494
11600
23
123
9619
13085
1672
4605891
rsmC
4605891
4605723
4605723
4604692
rsmC
rsmC
rsmCp1
AGGGCGACGCAGCGACCACTGGGTAATGCCCAGTTGCTGTAACTGC...
...
23971
25280
131
444
28109
37675
120
298
31832
30399
1673
4609176
osmY
4609419
4609176
4610024
4609419
osmY
osmY
osmYp
GTGATGACATTTCTGACGGCGTTAAATACCGTTCAATGCGTAGATA...
...
1131
1066
711
2113
957
1281
1854
1150
1343
1131
1674
4614702
deoCABD
4615346
4614702
4616125
4615346
deoC
deoCABD
deoCp1
ATACGGTTGCAACAACGCATCCAGTTGCCCCAGGTAGACCGGCATC...
...
749
744
84
108
653
1067
33
62
3473
2084
1675
4619680
yjjJ
4619792
4619680
4621123
4619792
yjjJ
yjjJ
yjjJp2
GGCTTTTTAGTATCTATTCATTTTTCTCTCCAGCTTGAATATTTTC...
...
1477
1516
3697
5754
1213
2070
2906
5983
2706
3402
1676
4622261
lplA
4622261
4622140
4622140
4621124
lplA
ytjB-lplA
lplAp2
ACAGGGTAAACGCACCCGCTGGCAGCAATCGCCCTTCCTGTTAACC...
...
304
318
255
159
381
409
109
73
695
685
1677
4631922
yjjX
4631922
4631768
4631768
4631256
yjjX
yjjX
yjjXp6
GTTGCTCACCTTTGGCGGTCAGCGGGCTGTCAGACTGGCCCTGAAT...
...
472
440
125
85
393
691
55
102
1072
1030
1678
4633266
creABC
4633544
4633266
4634017
4633544
creA
creABCD
creAp2
GACAGGGGCTGATCCAGATGACCTTCCAGCCAGATTAAAAGGTCGC...
...
1276
1070
344
421
940
1328
277
297
1849
1855
1679
4638531
arcA
4638531
4638329
4638329
4637613
arcA
arcA
arcAp1
GTTTTTGACACTGTCGGGTCCTGAGGGAAAGTACCCACGACCAAGC...
...
334
666
46
40
1263
620
14
22
1038
955
1680
4638535
arcA
4638535
4638329
4638329
4637613
arcA
arcA
arcAp2
AGCCGTTTTTGACACTGTCGGGTCCTGAGGGAAAGTACCCACGACC...
...
335
674
46
40
1270
630
14
22
1046
966
1681
4638558
arcA
4638558
4638329
4638329
4637613
arcA
arcA
arcAp3
GCTGTTAAAATGGTTAGGATGACAGCCGTTTTTGACACTGTCGGGT...
...
347
693
46
41
1284
664
14
23
1082
999
1682
4638622
arcA
4638622
4638329
4638329
4637613
arcA
arcA
arcAp4
ACTTGATATATGTCAACGAAGCGTAGTTTTATTGGGTGTCCGGCCC...
...
376
741
48
41
1337
717
14
26
1186
1078
1683
4638704
arcA
4638704
4638329
4638329
4637613
arcA
arcA
arcAp5
TAGTTGGATTATTAAAATAATGTGACGAAAGCTAGCATTTAGATAC...
...
420
797
54
45
1385
780
15
29
1310
1216
1684
4638711
arcA
4638711
4638329
4638329
4637613
arcA
arcA
arcAp6
ATGCAACTAGTTGGATTATTAAAATAATGTGACGAAAGCTAGCATT...
...
425
800
54
45
1387
780
15
29
1318
1225
1685
4638824
arcA
4638824
4638329
4638329
4637613
arcA
arcA
arcAp7
CTGTACTAACGGTTGAGTTGTTAAAAAATGCTACATATCCTTCTGT...
...
457
843
56
48
1420
835
15
32
1408
1333
1686
4638861
yjtD
4638965
4638861
4639651
4638965
yjtD
yjtD
yjtDp8
GGGCTTTTTCTGCGACTTACGTTAAGAATTTGTAAATTCGCACCGC...
...
185
232
23
25
272
268
10
16
362
297
1687 rows × 35 columns
In [5]:
id_vars = ['TSS','TU_name','coord_5','coord_3','gene', 'UTR_length']
value_vars = ['s9','s17','s19','s9+bcm','s17+bcm','s19+bcm']
dfn = dfm.copy()
# Normalize counts by gene and utr length
def norm_orf(barcode, rec):
return float(rec[barcode] / abs(rec['first_gene_5'] - rec['first_gene_3']))
def norm_utr(barcode, rec):
return float(rec['utr_{0}'.format(barcode)] / rec['UTR_length'])
for barcode in barcodes:
dfn['orf_{0}'.format(barcode)] = dfn.apply(lambda rec: norm_orf(barcode, rec), axis=1)
dfn['utr_{0}'.format(barcode)] = dfn.apply(lambda rec: norm_utr(barcode, rec), axis=1)
df = dfn[id_vars].copy()
# Take means across replicates according to the samples dict
for sample, bcs in samples.items():
df['orf_{0}'.format(sample)] = np.log10(dfn[['orf_{0}'.format(b) for b in list(bcs)]].mean(axis=1))
df['utr_{0}'.format(sample)] = np.log10(dfn[['utr_{0}'.format(b) for b in list(bcs)]].mean(axis=1))
df
Out[5]:
TSS
TU_name
coord_5
coord_3
gene
UTR_length
orf_s9
utr_s9
orf_s19+bcm
utr_s19+bcm
orf_s9+bcm
utr_s9+bcm
orf_s17+bcm
utr_s17+bcm
orf_s19
utr_s19
orf_s17
utr_s17
0
5030
yaaX
5030
5234
yaaX
204
-0.040733
0.278642
0.415199
0.357356
0.224628
0.247275
0.237554
0.336283
-0.123962
-0.040117
-0.068171
0.262079
1
6587
yaaA
6459
6587
yaaA
128
0.040578
0.236199
0.232518
0.432493
-0.027730
0.180592
0.048408
0.256402
-0.005917
0.096910
0.068941
0.281956
2
6615
yaaA
6459
6615
yaaA
156
0.040578
0.161944
0.232518
0.375077
-0.027730
0.116506
0.048408
0.199572
-0.005917
0.015048
0.068941
0.202202
3
11542
yaaW
11356
11542
yaaW
186
-1.158484
-1.052029
0.825338
0.913899
0.384203
0.536667
0.450862
0.639776
-1.074938
-1.014240
-1.112727
-0.889302
4
11913
yaaI
11786
11913
yaaI
127
-1.101231
-0.913472
0.611366
0.548443
0.270702
0.414710
0.369510
0.491693
-1.226170
-1.082614
-0.993598
-0.836632
5
11938
yaaI
11786
11938
yaaI
152
-1.101231
-0.926571
0.611366
0.555747
0.270702
0.397368
0.369510
0.493935
-1.226170
-1.121146
-0.993598
-0.903090
6
12048
dnaK-tpke11-dnaJ
12048
12163
dnaK
115
1.370097
1.332965
1.966770
2.117559
1.539609
1.618775
1.639229
1.762711
1.643435
1.564357
1.360430
1.520740
7
17317
nhaAR
17317
17489
nhaA
172
0.068752
-0.024675
0.958300
1.177100
0.466310
0.795880
0.557119
0.969811
-0.051339
-0.212276
-0.026888
0.005021
8
21210
rpsT
21078
21210
rpsT
132
2.351260
2.320020
1.655317
1.683940
1.922250
1.840087
1.954628
1.885721
2.264445
2.194230
2.231317
2.171582
9
21833
ileS-lspA-fkpB-ispH
21833
22391
ileS
558
0.818969
0.927446
0.933123
1.030067
0.554431
0.602352
0.598032
0.705459
0.949946
0.973649
0.793145
0.994123
10
22034
ileS-lspA-fkpB-ispH
22034
22391
ileS
357
0.818969
1.045520
0.933123
1.187537
0.554431
0.722297
0.598032
0.823726
0.949946
1.111927
0.793145
1.113475
11
22229
ileS-lspA-fkpB-ispH
22229
22391
ileS
162
0.818969
1.318179
0.933123
1.485002
0.554431
0.983888
0.598032
1.092924
0.949946
1.395413
0.793145
1.388673
12
25014
lspA-fkpB-ispH
25014
25207
lspA
193
0.812101
1.125894
0.775358
1.145324
0.489258
0.808564
0.520719
0.871899
0.882614
1.153143
0.703779
1.084308
13
28288
dapB
28288
28374
dapB
86
0.301427
0.226870
0.370426
0.382520
0.025426
0.014892
0.158091
0.211630
0.413118
0.186075
0.468394
0.343111
14
29551
carAB
29551
29651
carA
100
-0.269305
-0.055517
0.217094
0.320146
-0.047738
-0.026872
0.006570
0.122216
-0.120173
-0.019997
-0.376895
-0.207608
15
34218
caiF
34218
34300
caiF
82
-0.647207
-0.474481
0.753263
0.676694
0.069921
-0.016187
0.274684
0.226065
-0.754612
-0.752446
-0.685439
-0.783480
16
35499
caiE
35371
35499
caiE
128
-0.875982
-1.028029
0.465559
0.832559
-0.169335
0.258278
-0.054431
0.389720
-0.948030
-0.851937
-0.925754
-1.028029
17
42037
caiTABCDE
41931
42037
caiT
106
-1.773586
-1.849215
0.077073
0.069863
-0.375986
-0.396917
-0.180126
-0.243550
-1.732968
-1.849215
-1.655081
-1.724276
18
45592
yaaU
45592
45807
yaaU
215
-1.275989
-1.235528
0.372890
0.739259
-0.173083
0.206010
0.008722
0.431364
-1.425208
-1.311249
-1.240517
-1.101990
19
47080
kefFC
47080
47246
kefF
166
-0.520156
-0.464233
0.424789
0.433587
-0.015855
-0.041131
0.153383
0.102111
-0.592337
-0.581619
-0.438719
-0.384418
20
52034
apaGH
51606
52034
apaG
428
0.687531
0.888712
0.845591
1.039405
0.603785
0.875940
0.665081
0.907318
0.682296
0.903280
0.632905
0.842481
21
52588
rsmA-apaGH
52430
52588
rsmA
158
0.721794
1.091823
0.854295
1.085886
0.680933
0.908383
0.711535
0.943732
0.754369
1.031025
0.647533
0.993910
22
57241
lptD
57109
57241
lptD
132
0.741502
1.230739
0.619486
1.040944
0.439668
0.893737
0.495573
0.995535
0.758222
1.122836
0.769439
1.258369
23
57336
lptD-surA-pdxA
57109
57336
lptD
227
0.741502
1.019180
0.619486
0.839320
0.439668
0.685169
0.495573
0.788237
0.758222
0.916628
0.769439
1.042955
24
57268
djlA
57268
57364
djlA
96
0.010789
0.200999
0.388350
0.396127
0.114967
0.112898
0.147837
0.250725
0.090360
0.246898
-0.009699
0.152861
25
60450
rluA
60346
60450
rluA
104
0.466672
0.894657
0.596088
0.988362
0.542086
0.894391
0.554211
0.911874
0.611512
1.014981
0.423407
0.749008
26
70241
araC
70241
70387
araC
146
-0.668183
-0.375478
0.263466
0.670386
-0.069593
0.208559
0.074790
0.278910
-0.524365
-0.242666
-0.055596
0.177082
27
73085
thiQ
72927
73085
thiQ
158
-0.473713
0.113097
0.261314
0.654432
-0.050065
0.409869
0.038100
0.456481
-0.487830
0.069687
-0.499463
0.041892
28
79594
leuD
79453
79594
leuD
141
-0.572240
-0.378367
-0.017206
0.153977
-0.362626
-0.248852
-0.311198
-0.209700
-0.546227
-0.259917
-0.676245
-0.459023
29
84024
leuLABCD
83708
84024
leuL
316
0.425337
-0.275672
0.438414
0.076654
0.150078
-0.301030
0.247345
-0.149439
0.354421
-0.431501
0.684595
-0.060354
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
1657
4589390
yjiY
4589302
4589390
yjiY
88
1.239090
1.704170
1.505317
1.974543
1.181578
1.578222
1.325749
1.755918
1.617327
2.031294
1.392771
1.777287
1658
4589435
tsr
4589435
4589680
tsr
245
-0.675973
1.257140
1.408266
1.537845
0.992936
1.217215
1.106440
1.305702
-0.757149
1.599549
-0.400572
1.377470
1659
4592860
yjjL
4592745
4592860
yjjL
115
-0.542237
-0.529219
1.732302
1.758879
1.075871
1.395668
1.164711
1.477310
-0.528015
-0.458638
-0.393890
-0.113755
1660
4592867
yjjL
4592745
4592867
yjjL
122
-0.542237
-0.542292
1.732302
1.753024
1.075871
1.391412
1.164711
1.471989
-0.528015
-0.468312
-0.393890
-0.129711
1661
4592905
yjjL
4592745
4592905
yjjL
160
-0.542237
-0.546109
1.732302
1.735150
1.075871
1.393191
1.164711
1.473350
-0.528015
-0.518378
-0.393890
-0.176770
1662
4593970
yjjM
4593874
4593970
yjjM
96
-0.697310
-0.090177
1.489834
1.977462
1.152512
1.325225
1.316606
1.433786
-0.617538
-0.071114
-0.297245
-0.146581
1663
4598360
yjjA
4598212
4598360
yjjA
148
0.320163
0.413504
1.117177
1.074634
1.190927
1.019369
1.255053
1.207409
0.268406
0.468228
0.272884
0.444636
1664
4599638
dnaTC-yjjA
4599540
4599638
dnaT
98
0.185128
-0.423024
1.254556
1.361535
1.252353
1.430378
1.341084
1.475419
0.096749
-0.447158
0.078980
-0.287935
1665
4599638
dnaTC
4599540
4599638
dnaT
98
0.185128
-0.423024
1.254556
1.361535
1.252353
1.430378
1.341084
1.475419
0.096749
-0.447158
0.078980
-0.287935
1666
4600200
yjjB-dnaTC-yjjA
4599973
4600200
yjjB
227
-0.818612
0.228870
1.151988
1.064507
1.235552
1.133862
1.293640
1.234482
-0.749790
0.157857
-0.615591
0.184304
1667
4600200
yjjB-dnaTC
4599973
4600200
yjjB
227
-0.818612
0.228870
1.151988
1.064507
1.235552
1.133862
1.293640
1.234482
-0.749790
0.157857
-0.615591
0.184304
1668
4601057
yjjP
4600881
4601057
yjjP
176
-0.534308
0.244446
1.491480
1.477697
1.542712
1.427416
1.677174
1.518102
-0.503574
0.245148
-0.339948
0.185046
1669
4601342
yjjQ-bglJ
4601342
4601500
yjjQ
158
-0.925840
0.526028
1.295772
1.543675
1.192664
1.568387
1.335313
1.663206
-0.915855
0.430753
-0.714210
0.386239
1670
4603775
fhuF
4603686
4603775
fhuF
89
0.459404
-0.363929
1.698181
1.824031
1.653873
1.715299
1.749789
1.886522
0.606583
-0.331342
0.487199
-0.129846
1671
4604296
leuV
4604188
4604296
leuV
108
0.798297
-0.277549
1.168963
2.021649
1.233557
1.892043
1.022869
1.946443
0.869982
-0.170101
0.650398
-0.106567
1672
4605891
rsmC
4605723
4605891
rsmC
168
0.498959
0.206054
0.194645
2.267668
0.066826
2.166076
0.061558
2.291781
0.439448
0.094837
0.342958
0.233329
1673
4609176
osmY
4609176
4609419
osmY
243
-0.298882
0.840607
1.111562
0.706763
0.853829
0.655194
0.940426
0.663224
-0.366782
0.791064
-0.292500
0.764228
1674
4614702
deoCABD
4614702
4615346
deoC
644
0.542952
-0.919584
0.928006
0.634925
0.475818
0.064144
0.719282
0.125613
0.622080
-1.132192
1.070690
-0.826615
1675
4619680
yjjJ
4619680
4619792
yjjJ
112
-0.256711
1.588822
0.431037
1.435651
0.108564
1.125859
0.191477
1.166023
-0.165137
1.598605
0.001791
1.625230
1676
4622261
lplA
4622140
4622261
lplA
121
-0.410297
-0.135842
0.361858
0.756064
0.025120
0.409975
0.042130
0.513812
-0.143273
-0.123744
-0.323847
0.233185
1677
4631922
yjjX
4631768
4631922
yjjX
154
0.274582
-0.391641
0.779492
0.834082
0.528148
0.471444
0.552469
0.546479
0.187256
-0.292651
0.281956
-0.166331
1678
4633266
creABC
4633266
4633544
creA
278
-0.010689
0.108015
0.628119
0.823596
0.221665
0.625253
0.290576
0.610568
0.127570
0.013837
0.104735
0.138587
1679
4638531
arcA
4638329
4638531
arcA
202
1.193160
-1.087867
1.401348
0.693126
0.825740
0.393619
0.841006
0.668469
1.216159
-1.050079
1.167144
-0.671883
1680
4638535
arcA
4638329
4638535
arcA
206
1.193160
-1.096383
1.401348
0.688731
0.825740
0.388994
0.841006
0.663856
1.216159
-1.058595
1.167144
-0.680399
1681
4638558
arcA
4638329
4638558
arcA
229
1.193160
-1.104563
1.401348
0.657407
0.825740
0.356168
0.841006
0.628723
1.216159
-1.092664
1.167144
-0.721346
1682
4638622
arcA
4638329
4638622
arcA
293
1.193160
-1.188114
1.401348
0.586979
0.825740
0.280156
0.841006
0.544703
1.216159
-1.165838
1.167144
-0.818508
1683
4638704
arcA
4638329
4638704
arcA
375
1.193160
-1.262277
1.401348
0.527372
0.825740
0.210229
0.841006
0.460397
1.216159
-1.231609
1.167144
-0.879426
1684
4638711
arcA
4638329
4638711
arcA
382
1.193160
-1.259844
1.401348
0.522253
0.825740
0.205043
0.841006
0.452766
1.216159
-1.239641
1.167144
-0.887458
1685
4638824
arcA
4638329
4638824
arcA
495
1.193160
-1.323537
1.401348
0.442274
0.825740
0.118308
0.841006
0.357511
1.216159
-1.323537
1.167144
-0.978602
1686
4638861
yjtD
4638861
4638965
yjtD
104
-0.532049
-0.799549
-0.009925
0.500822
-0.479343
0.302073
-0.367239
0.414330
-0.272843
-0.903090
-0.499864
-0.636822
1687 rows × 18 columns
Two clusters are apparent. We are after the UTRs that are upregulated by the addition of BCM (cloud of points in the left part of the plot along y=0 line and in general (significantly) above y=x line).
BTW, the point size is the length of UTR. No (apparent) correlation here.
In [6]:
p = ggplot(df, aes(x='utr_s9', y='utr_s9+bcm', size='UTR_length')) \
+ geom_point(alpha=0.1) \
+ geom_abline(slope=1, intercept=0, size=2.5, color='#586e75')
print(p)
<ggplot: (8763937444749)>
In [8]:
p = ggplot(df, aes(x='utr_s9', y='utr_s19', size='UTR_length')) \
+ geom_point(alpha=0.1) \
+ geom_abline(slope=1, intercept=0, size=2.5, color='#586e75')
print(p)
<ggplot: (-9223363272917432888)>
In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import euclidean_distances
from sklearn.neighbors import kneighbors_graph
from sklearn import cluster
from sklearn import mixture
X = df.as_matrix(columns=['utr_s9', 'utr_s9+bcm'])
X = StandardScaler().fit_transform(X)
bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
connectivity = kneighbors_graph(X, n_neighbors=20)
connectivity = 0.05 * (connectivity + connectivity.T)
#distances = euclidean_distances(X)
gmm = mixture.GMM(n_components=2, covariance_type='full')
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=2, batch_size=200)
kmeans = cluster.KMeans(n_clusters=2)
ward = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward', connectivity=connectivity)
spectral = cluster.SpectralClustering(n_clusters=2, n_neighbors=20, eigen_solver='arpack', affinity='nearest_neighbors')
dbscan = cluster.DBSCAN(eps=.5)
affinity_propagation = cluster.AffinityPropagation(damping=.95, preference=-200)
average_linkage = cluster.AgglomerativeClustering(linkage='average', affinity='cityblock', n_clusters=2, connectivity=connectivity)
for name, alg in [
('MiniBatchKMeans', two_means),
('KMeans', kmeans),
('AffinityPropagation', affinity_propagation),
('MeanShift', ms),
('GMM', gmm),
('SpectralClustering', spectral),
('Ward', ward),
('AgglomerativeClustering', average_linkage),
('DBSCAN', dbscan)
]:
alg.fit(X)
if hasattr(alg, 'labels_'):
df['label'] = alg.labels_.astype(np.int)
else:
df['label'] = alg.predict(X)
p = ggplot(df, aes(x='utr_s9', y='utr_s9+bcm', color='label')) \
+ geom_point(alpha=0.5) \
+ ggtitle(name) \
+ geom_abline(slope=1, intercept=0, size=2.5, color='#586e75')
print(p)
/home/ilya/.venv/pydata3/lib/python3.4/site-packages/sklearn/neighbors/graph.py:37: DeprecationWarning: The behavior of 'kneighbors_graph' when mode='connectivity' will change in version 0.18. Presently, the nearest neighbor of each sample is the sample itself. Beginning in version 0.18, the default behavior will be to exclude each sample from being its own nearest neighbor. To maintain the current behavior, set include_self=True.
"behavior, set include_self=True.", DeprecationWarning)
<ggplot: (8763937443690)>
<ggplot: (8763931127223)>
<ggplot: (8763924495480)>
<ggplot: (8763937442119)>
<ggplot: (-9223363272930384440)>
<ggplot: (-9223363272930297831)>
<ggplot: (-9223363272930371019)>
<ggplot: (8763924531895)>
<ggplot: (8763924256765)>
In [ ]:
X = df.as_matrix
Content source: eco32i/biodata
Similar notebooks: