In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
from ggplot import *

import os
import sys


/home/ilya/.venv/pydata/lib/python3.4/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

In [3]:
offsets = [150,200,300]
winsizes = [50,80,100,200]
output_tpl = '../results/dfa_mp.offset_{}.win_{}.csv'

output = []

for offset in offsets:
    for winsize in winsizes:
        df = pd.DataFrame.from_csv(output_tpl.format(offset, winsize))
        df['win'] = winsize
        df['offset'] = offset
        output.append(df)
        
dfa = pd.concat(output)

In [4]:
dfa['UTR_length'] = dfa['end_x'] - dfa['start_x']
dfa


Out[4]:
TSS end_x start_x gene strand_x end_y start_y strand_y strand ratio_ATCACG ratio_ACAGTG ratio_CGATGT ratio_GCCAAT win offset UTR_length
0 148 190 148 thrL + 255.0 190.0 + + 3.000000 2.784355 0.911828 3.178117 50 150 42
1 148 190 148 thrL + 255.0 190.0 + + 3.000000 2.784355 0.911828 3.178117 50 150 42
2 5030 5234 5030 yaaX + 5530.0 5234.0 + + 4.576923 6.983333 1.264901 1.436242 50 150 204
3 6587 6587 6459 yaaA - 6459.0 5683.0 - - 0.032028 0.072193 0.567568 0.600000 50 150 128
4 6615 6615 6459 yaaA - 6459.0 5683.0 - - 0.034091 0.090379 0.654135 0.582011 50 150 156
5 8017 8017 7959 yaaJ - 7959.0 6529.0 - - 0.875000 0.571429 0.885246 1.196262 50 150 58
6 8191 8238 8191 talB + 9191.0 8238.0 + + 0.478825 0.513356 0.473950 0.564393 50 150 47
9 11542 11542 11356 yaaW - 11356.0 10643.0 - - 0.666667 1.777778 1.327273 1.012658 50 150 186
10 11825 11825 11786 yaaI - 11786.0 11382.0 - - 0.500000 2.625000 0.652330 0.474874 50 150 39
11 11913 11913 11786 yaaI - 11786.0 11382.0 - - 0.333333 0.555556 1.748148 1.713376 50 150 127
12 11938 11938 11786 yaaI - 11786.0 11382.0 - - 0.857143 0.428571 1.100592 1.442623 50 150 152
13 12048 12163 12048 dnaK + 14079.0 12163.0 + + 0.252212 0.207481 0.171599 0.301158 50 150 115
14 12123 12163 12123 dnaK + 14079.0 12163.0 + + 0.869191 0.539653 0.430504 1.010352 50 150 40
15 12144 12163 12144 dnaK + 14079.0 12163.0 + + 0.979294 0.717621 0.513948 1.066012 50 150 19
18 16951 16951 16903 hokC - 16903.0 16751.0 - - 0.478261 0.569767 0.599631 0.459902 50 150 48
19 17317 17489 17317 nhaA + 18655.0 17489.0 + + 0.052632 0.126904 2.822222 1.647166 50 150 172
20 17458 17489 17458 nhaA + 18655.0 17489.0 + + 1.067073 1.989583 0.762238 1.602339 50 150 31
21 21120 21120 21078 rpsT - 21078.0 20815.0 - - 0.752518 0.615503 0.493768 0.752228 50 150 42
22 21210 21210 21078 rpsT - 21078.0 20815.0 - - 0.278619 0.579581 0.220507 0.358928 50 150 132
23 21383 21407 21383 ribF + 22348.0 21407.0 + + 0.922207 1.056693 0.849432 0.966921 50 150 24
24 21833 22391 21833 ileS + 25207.0 22391.0 + + 1.352113 1.040936 1.098859 1.163180 50 150 558
25 22034 22391 22034 ileS + 25207.0 22391.0 + + 0.528970 0.743542 0.934363 0.388699 50 150 357
26 22229 22391 22229 ileS + 25207.0 22391.0 + + 0.418221 0.240061 0.299776 0.510862 50 150 162
27 25014 25207 25014 lspA + 25701.0 25207.0 + + 0.850227 0.498730 0.592040 0.854137 50 150 193
28 28288 28374 28288 dapB + 29195.0 28374.0 + + 0.544828 1.341176 0.757576 0.496063 50 150 86
29 28343 28374 28343 dapB + 29195.0 28374.0 + + 1.752809 1.933333 1.785714 1.243902 50 150 31
30 29551 29651 29551 carA + 30799.0 29651.0 + + 0.790000 0.430233 0.240310 0.424419 50 150 100
31 29619 29651 29619 carA + 30799.0 29651.0 + + 0.788462 0.435897 0.461957 0.466912 50 150 32
32 30775 30817 30775 carB + 34038.0 30817.0 + + 0.513514 0.761194 0.406593 1.136000 50 150 42
33 34218 34300 34218 caiF + 34695.0 34300.0 + + 0.764706 1.388889 0.357143 0.403846 50 150 82
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3754 4609344 4609414 4609344 prfC + 4611003.0 4609414.0 + + 1.043222 0.723374 0.815589 1.112554 200 300 70
3755 4609356 4609414 4609356 prfC + 4611003.0 4609414.0 + + 1.113715 0.751312 0.856031 1.154138 200 300 58
3756 4611153 4611396 4611153 osmY + 4612001.0 4611396.0 + + 1.070175 1.486726 0.866915 0.928707 200 300 243
3757 4616679 4617323 4616679 deoC + 4618102.0 4617323.0 + + 1.140625 0.526882 1.102041 0.934307 200 300 644
3758 4617278 4617323 4617278 deoC + 4618102.0 4617323.0 + + 1.826840 2.649815 1.678423 2.319249 200 300 45
3759 4619567 4619603 4619567 deoB + 4620826.0 4619603.0 + + 0.520743 0.548993 0.379441 0.564190 200 300 36
3760 4621657 4621769 4621657 yjjJ + 4623100.0 4621769.0 + + 3.920833 14.337209 1.594747 1.465487 200 300 112
3761 4621716 4621769 4621716 yjjJ + 4623100.0 4621769.0 + + 1.156682 2.305970 0.783037 0.875229 200 300 53
3762 4624238 4624238 4624117 lplA - 4624117.0 4623101.0 - - 2.214286 1.905263 1.272436 1.488294 200 300 121
3763 4624799 4624799 4624789 ytjB - 4624789.0 4624145.0 - - 1.145985 1.015267 0.684332 0.680734 200 300 10
3764 4624856 4624895 4624856 serB + 4625863.0 4624895.0 + + 1.471910 2.332288 1.002410 1.592798 200 300 39
3765 4630566 4630566 4630522 yjjK - 4630522.0 4628855.0 - - 0.975590 1.301775 0.613567 0.892770 200 300 44
3766 4630700 4630733 4630700 slt + 4632670.0 4630733.0 + + 0.843023 0.823171 0.951872 0.894191 200 300 33
3767 4632704 4632760 4632704 trpR + 4633086.0 4632760.0 + + 1.302372 1.444231 0.835112 0.975930 200 300 56
3768 4633773 4633773 4633745 yjjX - 4633745.0 4633233.0 - - 3.361868 3.631399 1.012745 1.262749 200 300 28
3769 4633899 4633899 4633745 yjjX - 4633745.0 4633233.0 - - 3.716738 5.043478 1.026196 1.508820 200 300 154
3770 4635243 4635521 4635243 creA + 4635994.0 4635521.0 + + 2.986159 2.272251 1.123288 1.083676 200 300 278
3771 4635353 4635353 4635310 rob - 4635310.0 4634441.0 - - 0.902421 0.560804 0.475946 0.853982 200 300 43
3772 4635477 4635521 4635477 creA + 4635994.0 4635521.0 + + 0.989286 1.091716 0.517182 0.904128 200 300 44
3773 4638160 4638178 4638160 creD + 4639530.0 4638178.0 + + 1.642857 3.800000 1.421053 0.748068 200 300 18
3774 4640358 4640402 4640358 yjjY + 4640542.0 4640402.0 + + 13.010830 11.512545 14.250000 7.067416 200 300 44
3775 4640508 4640508 4640306 arcA - 4640306.0 4639590.0 - - 1.163365 0.827256 0.823056 1.180585 200 300 202
3776 4640512 4640512 4640306 arcA - 4640306.0 4639590.0 - - 1.167142 0.837495 0.831858 1.187599 200 300 206
3777 4640535 4640535 4640306 arcA - 4640306.0 4639590.0 - - 1.057403 0.763457 0.804410 1.089905 200 300 229
3778 4640599 4640599 4640306 arcA - 4640306.0 4639590.0 - - 0.867294 0.786859 0.757342 0.862293 200 300 293
3779 4640681 4640681 4640306 arcA - 4640306.0 4639590.0 - - 0.542907 0.452288 0.399267 0.477665 200 300 375
3780 4640688 4640688 4640306 arcA - 4640306.0 4639590.0 - - 0.515849 0.440549 0.386567 0.455797 200 300 382
3781 4640801 4640801 4640306 arcA - 4640306.0 4639590.0 - - 0.089461 0.110785 0.126010 0.168638 200 300 495
3782 4640838 4640942 4640838 yjtD + 4641628.0 4640942.0 + + 1.639535 0.945946 1.095890 2.051546 200 300 104
3783 4640898 4640942 4640898 yjtD + 4641628.0 4640942.0 + + 1.056604 0.763636 0.952381 1.453782 200 300 44

43812 rows × 16 columns


In [5]:
all_utrs = dfa[['UTR_length', 'TSS', 'gene',
         'ratio_ATCACG', 'ratio_ACAGTG',
         'ratio_CGATGT', 'ratio_GCCAAT',
         'win', 'offset']]

long_utrs = all_utrs[(all_utrs['win'] == 80)
     & (all_utrs['offset'] == 200)
     & (all_utrs['UTR_length'] > 80)
     & ((all_utrs['ratio_ATCACG'] + all_utrs['ratio_ACAGTG']) / 2 >= 1.5) ]

short_utrs = all_utrs[(all_utrs['win'] == 80)
     & (all_utrs['offset'] == 200)
     & (all_utrs['UTR_length'] > 0)
     & (all_utrs['UTR_length'] <= 80)]

In [6]:
long_utrs


Out[6]:
UTR_length TSS gene ratio_ATCACG ratio_ACAGTG ratio_CGATGT ratio_GCCAAT win offset
2 204 5030 yaaX 4.275862 6.000000 0.820000 1.031963 80 200
33 82 34218 caiF 1.466667 1.736842 0.378641 0.500000 80 200
39 215 45592 yaaU 2.333333 3.500000 3.521127 1.899281 80 200
77 288 102867 ftsQ 1.672087 2.145349 1.531915 1.520833 80 200
84 156 117705 ppdD 6.272727 10.100000 1.447257 1.397459 80 200
88 99 121650 aroP 2.810026 2.224396 0.772128 2.149912 80 200
89 120 121671 aroP 2.761506 2.653791 0.675000 2.147002 80 200
94 206 131466 yacH 0.666667 6.500000 0.873418 1.493827 80 200
95 96 131519 acnB 2.233914 1.190070 0.950076 2.142091 80 200
139 133 177757 yadS 1.693182 2.162500 0.848780 1.017460 80 200
153 186 193335 dxr 63.143939 57.567039 34.624434 34.093023 80 200
154 183 193338 dxr 63.918919 58.315341 34.704545 34.058480 80 200
155 110 193411 dxr 70.729064 85.260870 37.570681 35.712230 80 200
159 122 195555 cdsA 4.146018 2.650794 2.969582 3.536842 80 200
160 116 195561 cdsA 4.328704 2.763736 3.075099 3.582888 80 200
162 902 197026 bamA 2.199029 1.814493 2.207650 1.850806 80 200
166 210 208411 accA 1.810316 3.447090 1.783626 1.486111 80 200
172 132 217135 yaeF 61.875000 22.104167 4.588957 2.217213 80 200
176 286 223485 rrsH 13.245690 7.219617 7.841642 9.765564 80 200
177 178 223593 rrsH 109.071429 77.366071 67.159292 68.062500 80 200
206 93 255809 pepD 1.090257 2.019544 0.875000 1.023328 80 200
245 93 310119 ecpB 2.200000 5.875000 2.991379 2.830189 80 200
258 165 318484 ykgB 2.750000 1.133333 0.790055 0.624190 80 200
259 206 318525 ykgB 1.666667 1.909091 0.913043 0.769634 80 200
276 186 335739 yahE 2.500000 2.750000 0.694690 0.931193 80 200
277 152 345252 yahM 87.200000 14.118421 3.755853 2.867596 80 200
303 131 380973 yaiO 0.263158 3.000000 0.786127 0.694737 80 200
307 122 384738 yaiS 3.666667 1.000000 0.669231 1.819355 80 200
320 214 399547 yaiY 4.833333 2.500000 0.809160 0.830688 80 200
340 82 426055 tgt 4.195616 4.621410 1.570605 2.644444 80 200
... ... ... ... ... ... ... ... ... ...
3471 306 4233539 lysC 3.022222 2.016529 0.924901 1.395522 80 200
3483 180 4256457 dgkA 2.803448 1.698598 0.891986 1.279070 80 200
3504 123 4275164 yjcB 1.549296 1.740088 1.137008 0.955912 80 200
3528 206 4301236 mdtP 2.000000 1.666667 0.634731 0.576923 80 200
3536 102 4326836 yjdM 1.027248 2.497512 0.938776 1.046414 80 200
3540 95 4330407 proP 3.777419 2.614916 2.308852 2.415238 80 200
3550 531 4350565 dcuR 2.173333 2.076923 1.377451 2.159420 80 200
3552 194 4352390 yjdK 4.000000 4.250000 4.494253 2.431193 80 200
3578 81 4376794 sugE 1.532609 2.431193 1.131148 1.456704 80 200
3584 94 4382412 frdA 8.224719 4.714286 1.580838 2.391753 80 200
3590 156 4392736 yjeV 1.443478 1.737991 1.192755 1.091853 80 200
3598 200 4399052 miaA 1.084833 2.060811 1.197774 1.136442 80 200
3602 146 4406044 nsrR 3.232323 5.508333 2.173258 1.807471 80 200
3604 196 4416756 yjfP 1.823529 1.518519 0.956522 1.264957 80 200
3628 85 4442261 msrA 1.575472 1.721311 0.746032 1.287402 80 200
3648 101 4467347 treR 2.018182 1.125000 0.549451 1.164179 80 200
3661 164 4479566 yjgN 21.000000 38.000000 1.043478 0.911950 80 200
3665 82 4483919 valS 1.215054 1.822819 0.866803 1.535545 80 200
3666 90 4483927 valS 1.233333 1.988032 0.991632 1.594037 80 200
3702 427 4539928 nanC 4.000000 2.888889 1.863158 1.618557 80 200
3703 290 4540667 fimB 2.538462 0.734043 0.526718 0.811655 80 200
3706 421 4542694 fimA 5.000000 0.600000 0.629630 0.816949 80 200
3708 118 4551518 uxuA 1.426966 1.586207 1.336283 1.520930 80 200
3726 130 4588995 yjiA 3.257576 2.170157 1.777592 2.028616 80 200
3742 227 4602177 yjjB 1.500000 1.714286 2.501566 2.174910 80 200
3743 227 4602177 yjjB 1.500000 1.714286 2.501566 2.174910 80 200
3749 108 4606273 leuV 28.709748 22.305071 4.761766 1.871959 80 200
3760 112 4621657 yjjJ 4.765432 15.809859 1.843658 1.668675 80 200
3770 278 4635243 creA 5.253165 2.875445 3.691667 1.803324 80 200
3782 104 4640838 yjtD 3.416667 1.435484 1.170732 1.753425 80 200

447 rows × 9 columns


In [7]:
short_utrs


Out[7]:
UTR_length TSS gene ratio_ATCACG ratio_ACAGTG ratio_CGATGT ratio_GCCAAT win offset
0 42 148 thrL 2.520732 5.070359 1.262385 3.096360 80 200
1 42 148 thrL 2.520732 5.070359 1.262385 3.096360 80 200
5 58 8017 yaaJ 1.727273 0.531250 0.944444 2.175676 80 200
6 47 8191 talB 0.665059 0.707647 0.740161 0.791085 80 200
10 39 11825 yaaI 1.000000 2.100000 0.559783 0.383459 80 200
14 40 12123 dnaK 0.810839 0.652505 0.512525 0.938957 80 200
15 19 12144 dnaK 0.905255 0.742760 0.588723 1.051353 80 200
18 48 16951 hokC 0.533333 0.545852 0.764202 0.580508 80 200
20 31 17458 nhaA 0.788104 1.322785 0.607143 0.966454 80 200
21 42 21120 rpsT 1.246565 0.828911 0.729745 1.175997 80 200
23 24 21383 ribF 1.215054 1.455535 0.905759 1.371758 80 200
29 31 28343 dapB 1.033898 0.731481 1.011364 0.898551 80 200
31 32 29619 carA 0.802260 0.393519 0.491935 0.507788 80 200
32 42 30775 carB 0.330827 0.662791 0.300725 0.871134 80 200
34 28 35399 caiE 0.500000 0.600000 1.511111 0.980645 80 200
35 69 35440 caiE 0.230769 0.875000 1.655914 1.224359 80 200
38 78 42325 fixA 0.750000 0.250000 0.337209 1.109091 80 200
41 24 49799 folA 1.552036 2.148997 0.355308 0.433635 80 200
42 71 51293 apaH 1.162162 1.146018 1.037147 1.185031 80 200
45 47 57156 lptD 2.341463 2.536632 2.176570 2.242370 80 200
52 23 65803 polB 1.890909 1.492308 1.880795 1.447205 80 200
53 27 70075 araB 1.000000 1.000000 5.000000 9.000000 80 200
55 80 71271 yabI 0.833333 0.637363 0.878431 0.974026 80 200
58 39 77338 sgrR 1.046512 1.030928 0.497175 0.788321 80 200
61 27 83735 leuL 46.666667 54.875000 2.697674 2.983871 80 200
62 27 83735 leuL 46.666667 54.875000 2.697674 2.983871 80 200
66 64 84304 leuO 1.000000 1.272727 0.567901 1.082474 80 200
70 30 85600 ilvI 0.647059 0.361111 1.397059 1.304762 80 200
72 38 89596 mraZ 1.156609 1.069307 1.211009 0.959116 80 200
75 20 91012 ftsL 3.048062 3.515837 3.059226 3.174603 80 200
... ... ... ... ... ... ... ... ... ...
3717 23 4563734 yjiK 0.608696 0.657895 0.827273 0.451613 80 200
3720 46 4571705 yjiS 0.250000 0.076923 1.086957 0.774011 80 200
3721 31 4571720 yjiS 0.200000 0.090909 1.483871 0.962963 80 200
3724 77 4579917 symE 2.513514 3.340426 0.927614 1.638235 80 200
3725 40 4581502 hsdS 0.950000 0.245098 1.310427 1.307027 80 200
3728 68 4591347 yjiY 0.646853 0.627216 0.460072 0.775922 80 200
3731 24 4591633 tsr 0.933333 1.083333 1.040404 1.304954 80 200
3732 15 4594737 yjjL 1.111111 1.500000 0.840832 1.693396 80 200
3738 21 4600210 yjjA 2.500000 3.566667 3.400701 2.101633 80 200
3746 31 4605694 fhuF 1.565217 1.688805 0.986071 1.304343 80 200
3748 27 4605777 yjjZ 0.158025 0.307026 2.212301 1.686863 80 200
3750 31 4606432 leuQ 1.076535 1.402279 2.104099 0.901673 80 200
3751 25 4607725 rsmC 1.677852 1.631858 1.451064 1.636872 80 200
3752 22 4607781 holD 0.920220 1.069501 0.728555 0.888016 80 200
3754 70 4609344 prfC 0.531404 0.509370 0.330834 0.616338 80 200
3755 58 4609356 prfC 0.613364 0.540434 0.371023 0.675325 80 200
3758 45 4617278 deoC 0.487644 0.289855 0.355422 0.737024 80 200
3759 36 4619567 deoB 0.469436 0.628236 0.459067 0.611150 80 200
3761 53 4621716 yjjJ 0.974026 1.263158 0.334211 0.649215 80 200
3763 10 4624799 ytjB 0.828283 0.781609 0.603704 0.821918 80 200
3764 39 4624856 serB 2.815603 4.026144 1.230769 2.281081 80 200
3765 44 4630566 yjjK 1.346298 2.315018 0.702703 1.107946 80 200
3766 33 4630700 slt 0.863248 0.503106 0.649254 0.717262 80 200
3767 56 4632704 trpR 1.522727 1.495522 0.968366 1.058076 80 200
3768 28 4633773 yjjX 5.695312 7.983051 1.144289 2.394521 80 200
3771 43 4635353 rob 0.865867 0.700816 0.478824 0.871409 80 200
3772 44 4635477 creA 1.125828 2.138462 0.378882 0.828704 80 200
3773 18 4638160 creD 2.857143 6.375000 1.546512 1.095930 80 200
3774 44 4640358 yjjY 2.547758 2.093721 3.366534 2.455652 80 200
3783 44 4640898 yjtD 1.900000 0.768293 1.080000 2.679245 80 200

1903 rows × 9 columns

Long UTRs


In [8]:
samples_dict = {
    's9': ['ATCACG', 'ACAGTG'],
    's9+bcm': ['CGATGT', 'GCCAAT'],
}


res = []
for i,sample in enumerate(samples_dict):
    df = long_utrs[['UTR_length', 'TSS', 'gene']]
    df['loglen'] = np.log10(df['UTR_length'])
    dtmp = long_utrs[['ratio_{}'.format(bc) for bc in samples_dict[sample]]]
    df['mean_ratio'] = dtmp[['ratio_{}'.format(bc) for bc in samples_dict[sample]]].mean(axis=1)
    df['cond'] = sample
    res.append(df)
    
df15 = pd.concat(res)


/home/ilya/.venv/pydata/lib/python3.4/site-packages/ipykernel/__main__.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/ilya/.venv/pydata/lib/python3.4/site-packages/ipykernel/__main__.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/ilya/.venv/pydata/lib/python3.4/site-packages/ipykernel/__main__.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [9]:
df15['logratio'] = np.log10(df15['mean_ratio'])
df15


Out[9]:
UTR_length TSS gene loglen mean_ratio cond logratio
2 204 5030 yaaX 2.309630 0.925982 s9+bcm -0.033398
33 82 34218 caiF 1.913814 0.439320 s9+bcm -0.357219
39 215 45592 yaaU 2.332438 2.710204 s9+bcm 0.433002
77 288 102867 ftsQ 2.459392 1.526374 s9+bcm 0.183661
84 156 117705 ppdD 2.193125 1.422358 s9+bcm 0.153009
88 99 121650 aroP 1.995635 1.461020 s9+bcm 0.164656
89 120 121671 aroP 2.079181 1.411001 s9+bcm 0.149527
94 206 131466 yacH 2.313867 1.183622 s9+bcm 0.073213
95 96 131519 acnB 1.982271 1.546083 s9+bcm 0.189233
139 133 177757 yadS 2.123852 0.933120 s9+bcm -0.030062
153 186 193335 dxr 2.269513 34.358729 s9+bcm 1.536037
154 183 193338 dxr 2.262451 34.381512 s9+bcm 1.536325
155 110 193411 dxr 2.041393 36.641455 s9+bcm 1.563973
159 122 195555 cdsA 2.086360 3.253212 s9+bcm 0.512312
160 116 195561 cdsA 2.064458 3.328993 s9+bcm 0.522313
162 902 197026 bamA 2.955207 2.029228 s9+bcm 0.307331
166 210 208411 accA 2.322219 1.634868 s9+bcm 0.213483
172 132 217135 yaeF 2.120574 3.403085 s9+bcm 0.531873
176 286 223485 rrsH 2.456366 8.803603 s9+bcm 0.944660
177 178 223593 rrsH 2.250420 67.610896 s9+bcm 1.830017
206 93 255809 pepD 1.968483 0.949164 s9+bcm -0.022659
245 93 310119 ecpB 1.968483 2.910784 s9+bcm 0.464010
258 165 318484 ykgB 2.217484 0.707123 s9+bcm -0.150505
259 206 318525 ykgB 2.313867 0.841338 s9+bcm -0.075029
276 186 335739 yahE 2.269513 0.812941 s9+bcm -0.089941
277 152 345252 yahM 2.181844 3.311724 s9+bcm 0.520054
303 131 380973 yaiO 2.117271 0.740432 s9+bcm -0.130515
307 122 384738 yaiS 2.086360 1.244293 s9+bcm 0.094923
320 214 399547 yaiY 2.330414 0.819924 s9+bcm -0.086226
340 82 426055 tgt 1.913814 2.107525 s9+bcm 0.323773
... ... ... ... ... ... ... ...
3471 306 4233539 lysC 2.485721 2.519376 s9 0.401293
3483 180 4256457 dgkA 2.255273 2.251023 s9 0.352380
3504 123 4275164 yjcB 2.089905 1.644692 s9 0.216085
3528 206 4301236 mdtP 2.313867 1.833333 s9 0.263241
3536 102 4326836 yjdM 2.008600 1.762380 s9 0.246100
3540 95 4330407 proP 1.977724 3.196168 s9 0.504630
3550 531 4350565 dcuR 2.725095 2.125128 s9 0.327385
3552 194 4352390 yjdK 2.287802 4.125000 s9 0.615424
3578 81 4376794 sugE 1.908485 1.981901 s9 0.297082
3584 94 4382412 frdA 1.973128 6.469502 s9 0.810871
3590 156 4392736 yjeV 2.193125 1.590735 s9 0.201598
3598 200 4399052 miaA 2.301030 1.572822 s9 0.196680
3602 146 4406044 nsrR 2.164353 4.370328 s9 0.640514
3604 196 4416756 yjfP 2.292256 1.671024 s9 0.222983
3628 85 4442261 msrA 1.929419 1.648392 s9 0.217060
3648 101 4467347 treR 2.004321 1.571591 s9 0.196340
3661 164 4479566 yjgN 2.214844 29.500000 s9 1.469822
3665 82 4483919 valS 1.913814 1.518936 s9 0.181540
3666 90 4483927 valS 1.954243 1.610683 s9 0.207010
3702 427 4539928 nanC 2.630428 3.444444 s9 0.537119
3703 290 4540667 fimB 2.462398 1.636252 s9 0.213850
3706 421 4542694 fimA 2.624282 2.800000 s9 0.447158
3708 118 4551518 uxuA 2.071882 1.506587 s9 0.177994
3726 130 4588995 yjiA 2.113943 2.713866 s9 0.433588
3742 227 4602177 yjjB 2.356026 1.607143 s9 0.206054
3743 227 4602177 yjjB 2.356026 1.607143 s9 0.206054
3749 108 4606273 leuV 2.033424 25.507410 s9 1.406666
3760 112 4621657 yjjJ 2.049218 10.287646 s9 1.012316
3770 278 4635243 creA 2.444045 4.064305 s9 0.608986
3782 104 4640838 yjtD 2.017033 2.426075 s9 0.384904

894 rows × 7 columns


In [10]:
df15.loc[df15.cond == 's9', 'cond'] = '-bcm'
df15.loc[df15.cond == 's9+bcm', 'cond'] = '+bcm'
#ldf15 = df15[df15['UTR_length'] > 80]

In [12]:
def mark_rho(rec):
    if rec['gene'] == 'rpoS' and rec['UTR_length'] > 500:
        return 'rpoS'
    else:
        return ''

    
    
df15['label'] = df15.apply(mark_rho, axis=1)

p = ggplot(df15[df15['UTR_length'] < 650], aes(x='UTR_length', y='logratio', color='cond', label='label')) \
    + geom_point(alpha=0.25) \
    + geom_text(color="black", nudge_x=30, size=18) \
    + geom_smooth(method='lowess', span=1/5., size=3) \
    + xlab("5' UTR length") \
    + ylab("log(proximal/distal)") \
    + theme(axis_title=element_text(size=28),
            axis_text=element_text(size=24))
print(p)


<ggplot: (-9223363288333652245)>

In [17]:
df15.to_csv('../results/long_utrs.df15.csv')

Short UTRs


In [13]:
res = []
for i,sample in enumerate(samples_dict):
    df = short_utrs[['UTR_length', 'TSS', 'gene']]
    df['loglen'] = np.log10(df['UTR_length'])
    dtmp = short_utrs[['ratio_{}'.format(bc) for bc in samples_dict[sample]]]
    df['mean_ratio'] = dtmp[['ratio_{}'.format(bc) for bc in samples_dict[sample]]].mean(axis=1)
    df['cond'] = sample
    res.append(df)
    
sdf15 = pd.concat(res)


/home/ilya/.venv/pydata/lib/python3.4/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/ilya/.venv/pydata/lib/python3.4/site-packages/ipykernel/__main__.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/ilya/.venv/pydata/lib/python3.4/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [14]:
sdf15['logratio'] = np.log10(sdf15['mean_ratio'])
sdf15.loc[sdf15.cond == 's9', 'cond'] = '-bcm'
sdf15.loc[sdf15.cond == 's9+bcm', 'cond'] = '+bcm'
sdf15


Out[14]:
UTR_length TSS gene loglen mean_ratio cond logratio
0 42 148 thrL 1.623249 2.179373 +bcm 0.338331
1 42 148 thrL 1.623249 2.179373 +bcm 0.338331
5 58 8017 yaaJ 1.763428 1.560060 +bcm 0.193141
6 47 8191 talB 1.672098 0.765623 +bcm -0.115985
10 39 11825 yaaI 1.591065 0.471621 +bcm -0.326407
14 40 12123 dnaK 1.602060 0.725741 +bcm -0.139218
15 19 12144 dnaK 1.278754 0.820038 +bcm -0.086166
18 48 16951 hokC 1.681241 0.672355 +bcm -0.172401
20 31 17458 nhaA 1.491362 0.786798 +bcm -0.104137
21 42 21120 rpsT 1.623249 0.952871 +bcm -0.020966
23 24 21383 ribF 1.380211 1.138759 +bcm 0.056432
29 31 28343 dapB 1.491362 0.954957 +bcm -0.020016
31 32 29619 carA 1.505150 0.499862 +bcm -0.301150
32 42 30775 carB 1.623249 0.585929 +bcm -0.232155
34 28 35399 caiE 1.447158 1.245878 +bcm 0.095476
35 69 35440 caiE 1.838849 1.440136 +bcm 0.158404
38 78 42325 fixA 1.892095 0.723150 +bcm -0.140772
41 24 49799 folA 1.380211 0.394472 +bcm -0.403984
42 71 51293 apaH 1.851258 1.111089 +bcm 0.045749
45 47 57156 lptD 1.672098 2.209470 +bcm 0.344288
52 23 65803 polB 1.361728 1.664000 +bcm 0.221153
53 27 70075 araB 1.431364 7.000000 +bcm 0.845098
55 80 71271 yabI 1.903090 0.926229 +bcm -0.033282
58 39 77338 sgrR 1.591065 0.642748 +bcm -0.191959
61 27 83735 leuL 1.431364 2.840773 +bcm 0.453436
62 27 83735 leuL 1.431364 2.840773 +bcm 0.453436
66 64 84304 leuO 1.806180 0.825188 +bcm -0.083447
70 30 85600 ilvI 1.477121 1.350910 +bcm 0.130627
72 38 89596 mraZ 1.579784 1.085063 +bcm 0.035455
75 20 91012 ftsL 1.301030 3.116914 +bcm 0.493725
... ... ... ... ... ... ... ...
3717 23 4563734 yjiK 1.361728 0.633295 -bcm -0.198394
3720 46 4571705 yjiS 1.662758 0.163462 -bcm -0.786584
3721 31 4571720 yjiS 1.491362 0.145455 -bcm -0.837273
3724 77 4579917 symE 1.886491 2.926970 -bcm 0.466418
3725 40 4581502 hsdS 1.602060 0.597549 -bcm -0.223626
3728 68 4591347 yjiY 1.832509 0.637035 -bcm -0.195837
3731 24 4591633 tsr 1.380211 1.008333 -bcm 0.003604
3732 15 4594737 yjjL 1.176091 1.305556 -bcm 0.115795
3738 21 4600210 yjjA 1.322219 3.033333 -bcm 0.481920
3746 31 4605694 fhuF 1.491362 1.627011 -bcm 0.211390
3748 27 4605777 yjjZ 1.431364 0.232525 -bcm -0.633530
3750 31 4606432 leuQ 1.491362 1.239407 -bcm 0.093214
3751 25 4607725 rsmC 1.397940 1.654855 -bcm 0.218760
3752 22 4607781 holD 1.342423 0.994861 -bcm -0.002238
3754 70 4609344 prfC 1.845098 0.520387 -bcm -0.283674
3755 58 4609356 prfC 1.763428 0.576899 -bcm -0.238900
3758 45 4617278 deoC 1.653213 0.388750 -bcm -0.410330
3759 36 4619567 deoB 1.556303 0.548836 -bcm -0.260558
3761 53 4621716 yjjJ 1.724276 1.118592 -bcm 0.048672
3763 10 4624799 ytjB 1.000000 0.804946 -bcm -0.094233
3764 39 4624856 serB 1.591065 3.420873 -bcm 0.534137
3765 44 4630566 yjjK 1.643453 1.830658 -bcm 0.262607
3766 33 4630700 slt 1.518514 0.683177 -bcm -0.165467
3767 56 4632704 trpR 1.748188 1.509125 -bcm 0.178725
3768 28 4633773 yjjX 1.447158 6.839182 -bcm 0.835004
3771 43 4635353 rob 1.633468 0.783341 -bcm -0.106049
3772 44 4635477 creA 1.643453 1.632145 -bcm 0.212759
3773 18 4638160 creD 1.255273 4.616071 -bcm 0.664273
3774 44 4640358 yjjY 1.643453 2.320739 -bcm 0.365626
3783 44 4640898 yjtD 1.643453 1.334146 -bcm 0.125203

3806 rows × 7 columns


In [26]:
p = ggplot(sdf15, aes(x='UTR_length', y='logratio', color='cond')) \
    + geom_point(alpha=0.25) \
    + geom_smooth(method='lowess', span=1/5., size=3) \
    + xlab("5' UTR length") \
    + ylab("log(proximal/distal)") \
    + theme(axis_title=element_text(size=28),
            axis_text=element_text(size=24))
print(p)


<ggplot: (-9223363288220070270)>

In [ ]: