In [1]:
%matplotlib inline
In [2]:
import pandas as pd
import numpy as np
from ggplot import *
import os
import sys
/home/ilya/.venv/pydata/lib/python3.4/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
warnings.warn(self.msg_depr % (key, alt_key))
In [3]:
offsets = [150,200,300]
winsizes = [50,80,100,200]
output_tpl = '../results/dfa_mp.offset_{}.win_{}.csv'
output = []
for offset in offsets:
for winsize in winsizes:
df = pd.DataFrame.from_csv(output_tpl.format(offset, winsize))
df['win'] = winsize
df['offset'] = offset
output.append(df)
dfa = pd.concat(output)
In [4]:
dfa['UTR_length'] = dfa['end_x'] - dfa['start_x']
dfa
Out[4]:
TSS
end_x
start_x
gene
strand_x
end_y
start_y
strand_y
strand
ratio_ATCACG
ratio_ACAGTG
ratio_CGATGT
ratio_GCCAAT
win
offset
UTR_length
0
148
190
148
thrL
+
255.0
190.0
+
+
3.000000
2.784355
0.911828
3.178117
50
150
42
1
148
190
148
thrL
+
255.0
190.0
+
+
3.000000
2.784355
0.911828
3.178117
50
150
42
2
5030
5234
5030
yaaX
+
5530.0
5234.0
+
+
4.576923
6.983333
1.264901
1.436242
50
150
204
3
6587
6587
6459
yaaA
-
6459.0
5683.0
-
-
0.032028
0.072193
0.567568
0.600000
50
150
128
4
6615
6615
6459
yaaA
-
6459.0
5683.0
-
-
0.034091
0.090379
0.654135
0.582011
50
150
156
5
8017
8017
7959
yaaJ
-
7959.0
6529.0
-
-
0.875000
0.571429
0.885246
1.196262
50
150
58
6
8191
8238
8191
talB
+
9191.0
8238.0
+
+
0.478825
0.513356
0.473950
0.564393
50
150
47
9
11542
11542
11356
yaaW
-
11356.0
10643.0
-
-
0.666667
1.777778
1.327273
1.012658
50
150
186
10
11825
11825
11786
yaaI
-
11786.0
11382.0
-
-
0.500000
2.625000
0.652330
0.474874
50
150
39
11
11913
11913
11786
yaaI
-
11786.0
11382.0
-
-
0.333333
0.555556
1.748148
1.713376
50
150
127
12
11938
11938
11786
yaaI
-
11786.0
11382.0
-
-
0.857143
0.428571
1.100592
1.442623
50
150
152
13
12048
12163
12048
dnaK
+
14079.0
12163.0
+
+
0.252212
0.207481
0.171599
0.301158
50
150
115
14
12123
12163
12123
dnaK
+
14079.0
12163.0
+
+
0.869191
0.539653
0.430504
1.010352
50
150
40
15
12144
12163
12144
dnaK
+
14079.0
12163.0
+
+
0.979294
0.717621
0.513948
1.066012
50
150
19
18
16951
16951
16903
hokC
-
16903.0
16751.0
-
-
0.478261
0.569767
0.599631
0.459902
50
150
48
19
17317
17489
17317
nhaA
+
18655.0
17489.0
+
+
0.052632
0.126904
2.822222
1.647166
50
150
172
20
17458
17489
17458
nhaA
+
18655.0
17489.0
+
+
1.067073
1.989583
0.762238
1.602339
50
150
31
21
21120
21120
21078
rpsT
-
21078.0
20815.0
-
-
0.752518
0.615503
0.493768
0.752228
50
150
42
22
21210
21210
21078
rpsT
-
21078.0
20815.0
-
-
0.278619
0.579581
0.220507
0.358928
50
150
132
23
21383
21407
21383
ribF
+
22348.0
21407.0
+
+
0.922207
1.056693
0.849432
0.966921
50
150
24
24
21833
22391
21833
ileS
+
25207.0
22391.0
+
+
1.352113
1.040936
1.098859
1.163180
50
150
558
25
22034
22391
22034
ileS
+
25207.0
22391.0
+
+
0.528970
0.743542
0.934363
0.388699
50
150
357
26
22229
22391
22229
ileS
+
25207.0
22391.0
+
+
0.418221
0.240061
0.299776
0.510862
50
150
162
27
25014
25207
25014
lspA
+
25701.0
25207.0
+
+
0.850227
0.498730
0.592040
0.854137
50
150
193
28
28288
28374
28288
dapB
+
29195.0
28374.0
+
+
0.544828
1.341176
0.757576
0.496063
50
150
86
29
28343
28374
28343
dapB
+
29195.0
28374.0
+
+
1.752809
1.933333
1.785714
1.243902
50
150
31
30
29551
29651
29551
carA
+
30799.0
29651.0
+
+
0.790000
0.430233
0.240310
0.424419
50
150
100
31
29619
29651
29619
carA
+
30799.0
29651.0
+
+
0.788462
0.435897
0.461957
0.466912
50
150
32
32
30775
30817
30775
carB
+
34038.0
30817.0
+
+
0.513514
0.761194
0.406593
1.136000
50
150
42
33
34218
34300
34218
caiF
+
34695.0
34300.0
+
+
0.764706
1.388889
0.357143
0.403846
50
150
82
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
3754
4609344
4609414
4609344
prfC
+
4611003.0
4609414.0
+
+
1.043222
0.723374
0.815589
1.112554
200
300
70
3755
4609356
4609414
4609356
prfC
+
4611003.0
4609414.0
+
+
1.113715
0.751312
0.856031
1.154138
200
300
58
3756
4611153
4611396
4611153
osmY
+
4612001.0
4611396.0
+
+
1.070175
1.486726
0.866915
0.928707
200
300
243
3757
4616679
4617323
4616679
deoC
+
4618102.0
4617323.0
+
+
1.140625
0.526882
1.102041
0.934307
200
300
644
3758
4617278
4617323
4617278
deoC
+
4618102.0
4617323.0
+
+
1.826840
2.649815
1.678423
2.319249
200
300
45
3759
4619567
4619603
4619567
deoB
+
4620826.0
4619603.0
+
+
0.520743
0.548993
0.379441
0.564190
200
300
36
3760
4621657
4621769
4621657
yjjJ
+
4623100.0
4621769.0
+
+
3.920833
14.337209
1.594747
1.465487
200
300
112
3761
4621716
4621769
4621716
yjjJ
+
4623100.0
4621769.0
+
+
1.156682
2.305970
0.783037
0.875229
200
300
53
3762
4624238
4624238
4624117
lplA
-
4624117.0
4623101.0
-
-
2.214286
1.905263
1.272436
1.488294
200
300
121
3763
4624799
4624799
4624789
ytjB
-
4624789.0
4624145.0
-
-
1.145985
1.015267
0.684332
0.680734
200
300
10
3764
4624856
4624895
4624856
serB
+
4625863.0
4624895.0
+
+
1.471910
2.332288
1.002410
1.592798
200
300
39
3765
4630566
4630566
4630522
yjjK
-
4630522.0
4628855.0
-
-
0.975590
1.301775
0.613567
0.892770
200
300
44
3766
4630700
4630733
4630700
slt
+
4632670.0
4630733.0
+
+
0.843023
0.823171
0.951872
0.894191
200
300
33
3767
4632704
4632760
4632704
trpR
+
4633086.0
4632760.0
+
+
1.302372
1.444231
0.835112
0.975930
200
300
56
3768
4633773
4633773
4633745
yjjX
-
4633745.0
4633233.0
-
-
3.361868
3.631399
1.012745
1.262749
200
300
28
3769
4633899
4633899
4633745
yjjX
-
4633745.0
4633233.0
-
-
3.716738
5.043478
1.026196
1.508820
200
300
154
3770
4635243
4635521
4635243
creA
+
4635994.0
4635521.0
+
+
2.986159
2.272251
1.123288
1.083676
200
300
278
3771
4635353
4635353
4635310
rob
-
4635310.0
4634441.0
-
-
0.902421
0.560804
0.475946
0.853982
200
300
43
3772
4635477
4635521
4635477
creA
+
4635994.0
4635521.0
+
+
0.989286
1.091716
0.517182
0.904128
200
300
44
3773
4638160
4638178
4638160
creD
+
4639530.0
4638178.0
+
+
1.642857
3.800000
1.421053
0.748068
200
300
18
3774
4640358
4640402
4640358
yjjY
+
4640542.0
4640402.0
+
+
13.010830
11.512545
14.250000
7.067416
200
300
44
3775
4640508
4640508
4640306
arcA
-
4640306.0
4639590.0
-
-
1.163365
0.827256
0.823056
1.180585
200
300
202
3776
4640512
4640512
4640306
arcA
-
4640306.0
4639590.0
-
-
1.167142
0.837495
0.831858
1.187599
200
300
206
3777
4640535
4640535
4640306
arcA
-
4640306.0
4639590.0
-
-
1.057403
0.763457
0.804410
1.089905
200
300
229
3778
4640599
4640599
4640306
arcA
-
4640306.0
4639590.0
-
-
0.867294
0.786859
0.757342
0.862293
200
300
293
3779
4640681
4640681
4640306
arcA
-
4640306.0
4639590.0
-
-
0.542907
0.452288
0.399267
0.477665
200
300
375
3780
4640688
4640688
4640306
arcA
-
4640306.0
4639590.0
-
-
0.515849
0.440549
0.386567
0.455797
200
300
382
3781
4640801
4640801
4640306
arcA
-
4640306.0
4639590.0
-
-
0.089461
0.110785
0.126010
0.168638
200
300
495
3782
4640838
4640942
4640838
yjtD
+
4641628.0
4640942.0
+
+
1.639535
0.945946
1.095890
2.051546
200
300
104
3783
4640898
4640942
4640898
yjtD
+
4641628.0
4640942.0
+
+
1.056604
0.763636
0.952381
1.453782
200
300
44
43812 rows × 16 columns
In [5]:
all_utrs = dfa[['UTR_length', 'TSS', 'gene',
'ratio_ATCACG', 'ratio_ACAGTG',
'ratio_CGATGT', 'ratio_GCCAAT',
'win', 'offset']]
long_utrs = all_utrs[(all_utrs['win'] == 80)
& (all_utrs['offset'] == 200)
& (all_utrs['UTR_length'] > 80)
& ((all_utrs['ratio_ATCACG'] + all_utrs['ratio_ACAGTG']) / 2 >= 1.5) ]
short_utrs = all_utrs[(all_utrs['win'] == 80)
& (all_utrs['offset'] == 200)
& (all_utrs['UTR_length'] > 0)
& (all_utrs['UTR_length'] <= 80)]
In [6]:
long_utrs
Out[6]:
UTR_length
TSS
gene
ratio_ATCACG
ratio_ACAGTG
ratio_CGATGT
ratio_GCCAAT
win
offset
2
204
5030
yaaX
4.275862
6.000000
0.820000
1.031963
80
200
33
82
34218
caiF
1.466667
1.736842
0.378641
0.500000
80
200
39
215
45592
yaaU
2.333333
3.500000
3.521127
1.899281
80
200
77
288
102867
ftsQ
1.672087
2.145349
1.531915
1.520833
80
200
84
156
117705
ppdD
6.272727
10.100000
1.447257
1.397459
80
200
88
99
121650
aroP
2.810026
2.224396
0.772128
2.149912
80
200
89
120
121671
aroP
2.761506
2.653791
0.675000
2.147002
80
200
94
206
131466
yacH
0.666667
6.500000
0.873418
1.493827
80
200
95
96
131519
acnB
2.233914
1.190070
0.950076
2.142091
80
200
139
133
177757
yadS
1.693182
2.162500
0.848780
1.017460
80
200
153
186
193335
dxr
63.143939
57.567039
34.624434
34.093023
80
200
154
183
193338
dxr
63.918919
58.315341
34.704545
34.058480
80
200
155
110
193411
dxr
70.729064
85.260870
37.570681
35.712230
80
200
159
122
195555
cdsA
4.146018
2.650794
2.969582
3.536842
80
200
160
116
195561
cdsA
4.328704
2.763736
3.075099
3.582888
80
200
162
902
197026
bamA
2.199029
1.814493
2.207650
1.850806
80
200
166
210
208411
accA
1.810316
3.447090
1.783626
1.486111
80
200
172
132
217135
yaeF
61.875000
22.104167
4.588957
2.217213
80
200
176
286
223485
rrsH
13.245690
7.219617
7.841642
9.765564
80
200
177
178
223593
rrsH
109.071429
77.366071
67.159292
68.062500
80
200
206
93
255809
pepD
1.090257
2.019544
0.875000
1.023328
80
200
245
93
310119
ecpB
2.200000
5.875000
2.991379
2.830189
80
200
258
165
318484
ykgB
2.750000
1.133333
0.790055
0.624190
80
200
259
206
318525
ykgB
1.666667
1.909091
0.913043
0.769634
80
200
276
186
335739
yahE
2.500000
2.750000
0.694690
0.931193
80
200
277
152
345252
yahM
87.200000
14.118421
3.755853
2.867596
80
200
303
131
380973
yaiO
0.263158
3.000000
0.786127
0.694737
80
200
307
122
384738
yaiS
3.666667
1.000000
0.669231
1.819355
80
200
320
214
399547
yaiY
4.833333
2.500000
0.809160
0.830688
80
200
340
82
426055
tgt
4.195616
4.621410
1.570605
2.644444
80
200
...
...
...
...
...
...
...
...
...
...
3471
306
4233539
lysC
3.022222
2.016529
0.924901
1.395522
80
200
3483
180
4256457
dgkA
2.803448
1.698598
0.891986
1.279070
80
200
3504
123
4275164
yjcB
1.549296
1.740088
1.137008
0.955912
80
200
3528
206
4301236
mdtP
2.000000
1.666667
0.634731
0.576923
80
200
3536
102
4326836
yjdM
1.027248
2.497512
0.938776
1.046414
80
200
3540
95
4330407
proP
3.777419
2.614916
2.308852
2.415238
80
200
3550
531
4350565
dcuR
2.173333
2.076923
1.377451
2.159420
80
200
3552
194
4352390
yjdK
4.000000
4.250000
4.494253
2.431193
80
200
3578
81
4376794
sugE
1.532609
2.431193
1.131148
1.456704
80
200
3584
94
4382412
frdA
8.224719
4.714286
1.580838
2.391753
80
200
3590
156
4392736
yjeV
1.443478
1.737991
1.192755
1.091853
80
200
3598
200
4399052
miaA
1.084833
2.060811
1.197774
1.136442
80
200
3602
146
4406044
nsrR
3.232323
5.508333
2.173258
1.807471
80
200
3604
196
4416756
yjfP
1.823529
1.518519
0.956522
1.264957
80
200
3628
85
4442261
msrA
1.575472
1.721311
0.746032
1.287402
80
200
3648
101
4467347
treR
2.018182
1.125000
0.549451
1.164179
80
200
3661
164
4479566
yjgN
21.000000
38.000000
1.043478
0.911950
80
200
3665
82
4483919
valS
1.215054
1.822819
0.866803
1.535545
80
200
3666
90
4483927
valS
1.233333
1.988032
0.991632
1.594037
80
200
3702
427
4539928
nanC
4.000000
2.888889
1.863158
1.618557
80
200
3703
290
4540667
fimB
2.538462
0.734043
0.526718
0.811655
80
200
3706
421
4542694
fimA
5.000000
0.600000
0.629630
0.816949
80
200
3708
118
4551518
uxuA
1.426966
1.586207
1.336283
1.520930
80
200
3726
130
4588995
yjiA
3.257576
2.170157
1.777592
2.028616
80
200
3742
227
4602177
yjjB
1.500000
1.714286
2.501566
2.174910
80
200
3743
227
4602177
yjjB
1.500000
1.714286
2.501566
2.174910
80
200
3749
108
4606273
leuV
28.709748
22.305071
4.761766
1.871959
80
200
3760
112
4621657
yjjJ
4.765432
15.809859
1.843658
1.668675
80
200
3770
278
4635243
creA
5.253165
2.875445
3.691667
1.803324
80
200
3782
104
4640838
yjtD
3.416667
1.435484
1.170732
1.753425
80
200
447 rows × 9 columns
In [7]:
short_utrs
Out[7]:
UTR_length
TSS
gene
ratio_ATCACG
ratio_ACAGTG
ratio_CGATGT
ratio_GCCAAT
win
offset
0
42
148
thrL
2.520732
5.070359
1.262385
3.096360
80
200
1
42
148
thrL
2.520732
5.070359
1.262385
3.096360
80
200
5
58
8017
yaaJ
1.727273
0.531250
0.944444
2.175676
80
200
6
47
8191
talB
0.665059
0.707647
0.740161
0.791085
80
200
10
39
11825
yaaI
1.000000
2.100000
0.559783
0.383459
80
200
14
40
12123
dnaK
0.810839
0.652505
0.512525
0.938957
80
200
15
19
12144
dnaK
0.905255
0.742760
0.588723
1.051353
80
200
18
48
16951
hokC
0.533333
0.545852
0.764202
0.580508
80
200
20
31
17458
nhaA
0.788104
1.322785
0.607143
0.966454
80
200
21
42
21120
rpsT
1.246565
0.828911
0.729745
1.175997
80
200
23
24
21383
ribF
1.215054
1.455535
0.905759
1.371758
80
200
29
31
28343
dapB
1.033898
0.731481
1.011364
0.898551
80
200
31
32
29619
carA
0.802260
0.393519
0.491935
0.507788
80
200
32
42
30775
carB
0.330827
0.662791
0.300725
0.871134
80
200
34
28
35399
caiE
0.500000
0.600000
1.511111
0.980645
80
200
35
69
35440
caiE
0.230769
0.875000
1.655914
1.224359
80
200
38
78
42325
fixA
0.750000
0.250000
0.337209
1.109091
80
200
41
24
49799
folA
1.552036
2.148997
0.355308
0.433635
80
200
42
71
51293
apaH
1.162162
1.146018
1.037147
1.185031
80
200
45
47
57156
lptD
2.341463
2.536632
2.176570
2.242370
80
200
52
23
65803
polB
1.890909
1.492308
1.880795
1.447205
80
200
53
27
70075
araB
1.000000
1.000000
5.000000
9.000000
80
200
55
80
71271
yabI
0.833333
0.637363
0.878431
0.974026
80
200
58
39
77338
sgrR
1.046512
1.030928
0.497175
0.788321
80
200
61
27
83735
leuL
46.666667
54.875000
2.697674
2.983871
80
200
62
27
83735
leuL
46.666667
54.875000
2.697674
2.983871
80
200
66
64
84304
leuO
1.000000
1.272727
0.567901
1.082474
80
200
70
30
85600
ilvI
0.647059
0.361111
1.397059
1.304762
80
200
72
38
89596
mraZ
1.156609
1.069307
1.211009
0.959116
80
200
75
20
91012
ftsL
3.048062
3.515837
3.059226
3.174603
80
200
...
...
...
...
...
...
...
...
...
...
3717
23
4563734
yjiK
0.608696
0.657895
0.827273
0.451613
80
200
3720
46
4571705
yjiS
0.250000
0.076923
1.086957
0.774011
80
200
3721
31
4571720
yjiS
0.200000
0.090909
1.483871
0.962963
80
200
3724
77
4579917
symE
2.513514
3.340426
0.927614
1.638235
80
200
3725
40
4581502
hsdS
0.950000
0.245098
1.310427
1.307027
80
200
3728
68
4591347
yjiY
0.646853
0.627216
0.460072
0.775922
80
200
3731
24
4591633
tsr
0.933333
1.083333
1.040404
1.304954
80
200
3732
15
4594737
yjjL
1.111111
1.500000
0.840832
1.693396
80
200
3738
21
4600210
yjjA
2.500000
3.566667
3.400701
2.101633
80
200
3746
31
4605694
fhuF
1.565217
1.688805
0.986071
1.304343
80
200
3748
27
4605777
yjjZ
0.158025
0.307026
2.212301
1.686863
80
200
3750
31
4606432
leuQ
1.076535
1.402279
2.104099
0.901673
80
200
3751
25
4607725
rsmC
1.677852
1.631858
1.451064
1.636872
80
200
3752
22
4607781
holD
0.920220
1.069501
0.728555
0.888016
80
200
3754
70
4609344
prfC
0.531404
0.509370
0.330834
0.616338
80
200
3755
58
4609356
prfC
0.613364
0.540434
0.371023
0.675325
80
200
3758
45
4617278
deoC
0.487644
0.289855
0.355422
0.737024
80
200
3759
36
4619567
deoB
0.469436
0.628236
0.459067
0.611150
80
200
3761
53
4621716
yjjJ
0.974026
1.263158
0.334211
0.649215
80
200
3763
10
4624799
ytjB
0.828283
0.781609
0.603704
0.821918
80
200
3764
39
4624856
serB
2.815603
4.026144
1.230769
2.281081
80
200
3765
44
4630566
yjjK
1.346298
2.315018
0.702703
1.107946
80
200
3766
33
4630700
slt
0.863248
0.503106
0.649254
0.717262
80
200
3767
56
4632704
trpR
1.522727
1.495522
0.968366
1.058076
80
200
3768
28
4633773
yjjX
5.695312
7.983051
1.144289
2.394521
80
200
3771
43
4635353
rob
0.865867
0.700816
0.478824
0.871409
80
200
3772
44
4635477
creA
1.125828
2.138462
0.378882
0.828704
80
200
3773
18
4638160
creD
2.857143
6.375000
1.546512
1.095930
80
200
3774
44
4640358
yjjY
2.547758
2.093721
3.366534
2.455652
80
200
3783
44
4640898
yjtD
1.900000
0.768293
1.080000
2.679245
80
200
1903 rows × 9 columns
In [8]:
samples_dict = {
's9': ['ATCACG', 'ACAGTG'],
's9+bcm': ['CGATGT', 'GCCAAT'],
}
res = []
for i,sample in enumerate(samples_dict):
df = long_utrs[['UTR_length', 'TSS', 'gene']]
df['loglen'] = np.log10(df['UTR_length'])
dtmp = long_utrs[['ratio_{}'.format(bc) for bc in samples_dict[sample]]]
df['mean_ratio'] = dtmp[['ratio_{}'.format(bc) for bc in samples_dict[sample]]].mean(axis=1)
df['cond'] = sample
res.append(df)
df15 = pd.concat(res)
/home/ilya/.venv/pydata/lib/python3.4/site-packages/ipykernel/__main__.py:10: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/ilya/.venv/pydata/lib/python3.4/site-packages/ipykernel/__main__.py:12: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/ilya/.venv/pydata/lib/python3.4/site-packages/ipykernel/__main__.py:13: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
In [9]:
df15['logratio'] = np.log10(df15['mean_ratio'])
df15
Out[9]:
UTR_length
TSS
gene
loglen
mean_ratio
cond
logratio
2
204
5030
yaaX
2.309630
0.925982
s9+bcm
-0.033398
33
82
34218
caiF
1.913814
0.439320
s9+bcm
-0.357219
39
215
45592
yaaU
2.332438
2.710204
s9+bcm
0.433002
77
288
102867
ftsQ
2.459392
1.526374
s9+bcm
0.183661
84
156
117705
ppdD
2.193125
1.422358
s9+bcm
0.153009
88
99
121650
aroP
1.995635
1.461020
s9+bcm
0.164656
89
120
121671
aroP
2.079181
1.411001
s9+bcm
0.149527
94
206
131466
yacH
2.313867
1.183622
s9+bcm
0.073213
95
96
131519
acnB
1.982271
1.546083
s9+bcm
0.189233
139
133
177757
yadS
2.123852
0.933120
s9+bcm
-0.030062
153
186
193335
dxr
2.269513
34.358729
s9+bcm
1.536037
154
183
193338
dxr
2.262451
34.381512
s9+bcm
1.536325
155
110
193411
dxr
2.041393
36.641455
s9+bcm
1.563973
159
122
195555
cdsA
2.086360
3.253212
s9+bcm
0.512312
160
116
195561
cdsA
2.064458
3.328993
s9+bcm
0.522313
162
902
197026
bamA
2.955207
2.029228
s9+bcm
0.307331
166
210
208411
accA
2.322219
1.634868
s9+bcm
0.213483
172
132
217135
yaeF
2.120574
3.403085
s9+bcm
0.531873
176
286
223485
rrsH
2.456366
8.803603
s9+bcm
0.944660
177
178
223593
rrsH
2.250420
67.610896
s9+bcm
1.830017
206
93
255809
pepD
1.968483
0.949164
s9+bcm
-0.022659
245
93
310119
ecpB
1.968483
2.910784
s9+bcm
0.464010
258
165
318484
ykgB
2.217484
0.707123
s9+bcm
-0.150505
259
206
318525
ykgB
2.313867
0.841338
s9+bcm
-0.075029
276
186
335739
yahE
2.269513
0.812941
s9+bcm
-0.089941
277
152
345252
yahM
2.181844
3.311724
s9+bcm
0.520054
303
131
380973
yaiO
2.117271
0.740432
s9+bcm
-0.130515
307
122
384738
yaiS
2.086360
1.244293
s9+bcm
0.094923
320
214
399547
yaiY
2.330414
0.819924
s9+bcm
-0.086226
340
82
426055
tgt
1.913814
2.107525
s9+bcm
0.323773
...
...
...
...
...
...
...
...
3471
306
4233539
lysC
2.485721
2.519376
s9
0.401293
3483
180
4256457
dgkA
2.255273
2.251023
s9
0.352380
3504
123
4275164
yjcB
2.089905
1.644692
s9
0.216085
3528
206
4301236
mdtP
2.313867
1.833333
s9
0.263241
3536
102
4326836
yjdM
2.008600
1.762380
s9
0.246100
3540
95
4330407
proP
1.977724
3.196168
s9
0.504630
3550
531
4350565
dcuR
2.725095
2.125128
s9
0.327385
3552
194
4352390
yjdK
2.287802
4.125000
s9
0.615424
3578
81
4376794
sugE
1.908485
1.981901
s9
0.297082
3584
94
4382412
frdA
1.973128
6.469502
s9
0.810871
3590
156
4392736
yjeV
2.193125
1.590735
s9
0.201598
3598
200
4399052
miaA
2.301030
1.572822
s9
0.196680
3602
146
4406044
nsrR
2.164353
4.370328
s9
0.640514
3604
196
4416756
yjfP
2.292256
1.671024
s9
0.222983
3628
85
4442261
msrA
1.929419
1.648392
s9
0.217060
3648
101
4467347
treR
2.004321
1.571591
s9
0.196340
3661
164
4479566
yjgN
2.214844
29.500000
s9
1.469822
3665
82
4483919
valS
1.913814
1.518936
s9
0.181540
3666
90
4483927
valS
1.954243
1.610683
s9
0.207010
3702
427
4539928
nanC
2.630428
3.444444
s9
0.537119
3703
290
4540667
fimB
2.462398
1.636252
s9
0.213850
3706
421
4542694
fimA
2.624282
2.800000
s9
0.447158
3708
118
4551518
uxuA
2.071882
1.506587
s9
0.177994
3726
130
4588995
yjiA
2.113943
2.713866
s9
0.433588
3742
227
4602177
yjjB
2.356026
1.607143
s9
0.206054
3743
227
4602177
yjjB
2.356026
1.607143
s9
0.206054
3749
108
4606273
leuV
2.033424
25.507410
s9
1.406666
3760
112
4621657
yjjJ
2.049218
10.287646
s9
1.012316
3770
278
4635243
creA
2.444045
4.064305
s9
0.608986
3782
104
4640838
yjtD
2.017033
2.426075
s9
0.384904
894 rows × 7 columns
In [10]:
df15.loc[df15.cond == 's9', 'cond'] = '-bcm'
df15.loc[df15.cond == 's9+bcm', 'cond'] = '+bcm'
#ldf15 = df15[df15['UTR_length'] > 80]
In [12]:
def mark_rho(rec):
if rec['gene'] == 'rpoS' and rec['UTR_length'] > 500:
return 'rpoS'
else:
return ''
df15['label'] = df15.apply(mark_rho, axis=1)
p = ggplot(df15[df15['UTR_length'] < 650], aes(x='UTR_length', y='logratio', color='cond', label='label')) \
+ geom_point(alpha=0.25) \
+ geom_text(color="black", nudge_x=30, size=18) \
+ geom_smooth(method='lowess', span=1/5., size=3) \
+ xlab("5' UTR length") \
+ ylab("log(proximal/distal)") \
+ theme(axis_title=element_text(size=28),
axis_text=element_text(size=24))
print(p)
<ggplot: (-9223363288333652245)>
In [17]:
df15.to_csv('../results/long_utrs.df15.csv')
In [13]:
res = []
for i,sample in enumerate(samples_dict):
df = short_utrs[['UTR_length', 'TSS', 'gene']]
df['loglen'] = np.log10(df['UTR_length'])
dtmp = short_utrs[['ratio_{}'.format(bc) for bc in samples_dict[sample]]]
df['mean_ratio'] = dtmp[['ratio_{}'.format(bc) for bc in samples_dict[sample]]].mean(axis=1)
df['cond'] = sample
res.append(df)
sdf15 = pd.concat(res)
/home/ilya/.venv/pydata/lib/python3.4/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/ilya/.venv/pydata/lib/python3.4/site-packages/ipykernel/__main__.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/ilya/.venv/pydata/lib/python3.4/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
In [14]:
sdf15['logratio'] = np.log10(sdf15['mean_ratio'])
sdf15.loc[sdf15.cond == 's9', 'cond'] = '-bcm'
sdf15.loc[sdf15.cond == 's9+bcm', 'cond'] = '+bcm'
sdf15
Out[14]:
UTR_length
TSS
gene
loglen
mean_ratio
cond
logratio
0
42
148
thrL
1.623249
2.179373
+bcm
0.338331
1
42
148
thrL
1.623249
2.179373
+bcm
0.338331
5
58
8017
yaaJ
1.763428
1.560060
+bcm
0.193141
6
47
8191
talB
1.672098
0.765623
+bcm
-0.115985
10
39
11825
yaaI
1.591065
0.471621
+bcm
-0.326407
14
40
12123
dnaK
1.602060
0.725741
+bcm
-0.139218
15
19
12144
dnaK
1.278754
0.820038
+bcm
-0.086166
18
48
16951
hokC
1.681241
0.672355
+bcm
-0.172401
20
31
17458
nhaA
1.491362
0.786798
+bcm
-0.104137
21
42
21120
rpsT
1.623249
0.952871
+bcm
-0.020966
23
24
21383
ribF
1.380211
1.138759
+bcm
0.056432
29
31
28343
dapB
1.491362
0.954957
+bcm
-0.020016
31
32
29619
carA
1.505150
0.499862
+bcm
-0.301150
32
42
30775
carB
1.623249
0.585929
+bcm
-0.232155
34
28
35399
caiE
1.447158
1.245878
+bcm
0.095476
35
69
35440
caiE
1.838849
1.440136
+bcm
0.158404
38
78
42325
fixA
1.892095
0.723150
+bcm
-0.140772
41
24
49799
folA
1.380211
0.394472
+bcm
-0.403984
42
71
51293
apaH
1.851258
1.111089
+bcm
0.045749
45
47
57156
lptD
1.672098
2.209470
+bcm
0.344288
52
23
65803
polB
1.361728
1.664000
+bcm
0.221153
53
27
70075
araB
1.431364
7.000000
+bcm
0.845098
55
80
71271
yabI
1.903090
0.926229
+bcm
-0.033282
58
39
77338
sgrR
1.591065
0.642748
+bcm
-0.191959
61
27
83735
leuL
1.431364
2.840773
+bcm
0.453436
62
27
83735
leuL
1.431364
2.840773
+bcm
0.453436
66
64
84304
leuO
1.806180
0.825188
+bcm
-0.083447
70
30
85600
ilvI
1.477121
1.350910
+bcm
0.130627
72
38
89596
mraZ
1.579784
1.085063
+bcm
0.035455
75
20
91012
ftsL
1.301030
3.116914
+bcm
0.493725
...
...
...
...
...
...
...
...
3717
23
4563734
yjiK
1.361728
0.633295
-bcm
-0.198394
3720
46
4571705
yjiS
1.662758
0.163462
-bcm
-0.786584
3721
31
4571720
yjiS
1.491362
0.145455
-bcm
-0.837273
3724
77
4579917
symE
1.886491
2.926970
-bcm
0.466418
3725
40
4581502
hsdS
1.602060
0.597549
-bcm
-0.223626
3728
68
4591347
yjiY
1.832509
0.637035
-bcm
-0.195837
3731
24
4591633
tsr
1.380211
1.008333
-bcm
0.003604
3732
15
4594737
yjjL
1.176091
1.305556
-bcm
0.115795
3738
21
4600210
yjjA
1.322219
3.033333
-bcm
0.481920
3746
31
4605694
fhuF
1.491362
1.627011
-bcm
0.211390
3748
27
4605777
yjjZ
1.431364
0.232525
-bcm
-0.633530
3750
31
4606432
leuQ
1.491362
1.239407
-bcm
0.093214
3751
25
4607725
rsmC
1.397940
1.654855
-bcm
0.218760
3752
22
4607781
holD
1.342423
0.994861
-bcm
-0.002238
3754
70
4609344
prfC
1.845098
0.520387
-bcm
-0.283674
3755
58
4609356
prfC
1.763428
0.576899
-bcm
-0.238900
3758
45
4617278
deoC
1.653213
0.388750
-bcm
-0.410330
3759
36
4619567
deoB
1.556303
0.548836
-bcm
-0.260558
3761
53
4621716
yjjJ
1.724276
1.118592
-bcm
0.048672
3763
10
4624799
ytjB
1.000000
0.804946
-bcm
-0.094233
3764
39
4624856
serB
1.591065
3.420873
-bcm
0.534137
3765
44
4630566
yjjK
1.643453
1.830658
-bcm
0.262607
3766
33
4630700
slt
1.518514
0.683177
-bcm
-0.165467
3767
56
4632704
trpR
1.748188
1.509125
-bcm
0.178725
3768
28
4633773
yjjX
1.447158
6.839182
-bcm
0.835004
3771
43
4635353
rob
1.633468
0.783341
-bcm
-0.106049
3772
44
4635477
creA
1.643453
1.632145
-bcm
0.212759
3773
18
4638160
creD
1.255273
4.616071
-bcm
0.664273
3774
44
4640358
yjjY
1.643453
2.320739
-bcm
0.365626
3783
44
4640898
yjtD
1.643453
1.334146
-bcm
0.125203
3806 rows × 7 columns
In [26]:
p = ggplot(sdf15, aes(x='UTR_length', y='logratio', color='cond')) \
+ geom_point(alpha=0.25) \
+ geom_smooth(method='lowess', span=1/5., size=3) \
+ xlab("5' UTR length") \
+ ylab("log(proximal/distal)") \
+ theme(axis_title=element_text(size=28),
axis_text=element_text(size=24))
print(p)
<ggplot: (-9223363288220070270)>
In [ ]:
Content source: eco32i/rpoS
Similar notebooks: