In [1]:
import pandas as pd
import natsort as ns #3rd party package for natural sorting
import re
In [2]:
# Load the raw per-gene read-count table (tab-separated) into a DataFrame.
data = pd.read_csv("5G_counts.tsv", delimiter="\t")
In [3]:
# Column positions to keep: 0-8 plus 20-41 (42 itself is excluded); positions 9-19 are skipped.
columns_list = [position for position in range(42) if not 9 <= position < 20]
# Positional selection of those columns; this is what removes the old FM runs.
data_1 = data.iloc[:, columns_list]
In [4]:
# Goal: put the sample columns into time order. The frame is split in two here,
# the sample half is natural-sorted by column name, and the halves are rejoined in the next cell.
first_8 = data_1.iloc[:, :9]  # leading block of columns; despite the name it holds 9 columns (0-8)
remaining_data = data_1.iloc[:, 9:]  # every column after the first nine (the ones to be sorted)
# Natural sort compares embedded numbers numerically (e.g. T10m < T20m < T150m);
# natsorted() already returns a list of the column names.
cols = ns.natsorted(remaining_data.columns)
newdf = remaining_data.loc[:, cols]
In [5]:
# Rejoin side by side: the first nine columns followed by the naturally sorted FM34/FM40 columns.
data_2 = pd.concat(objs=(first_8, newdf), axis=1)
In [6]:
# Many of the remaining columns are QC runs; list the column names so the
# ones containing "QC" can be identified before they are removed.
data_2.columns.tolist()
Out[6]:
['locus_tag',
'product',
'type',
'gene_symbol',
'locus',
'start_coord',
'end_coord',
'note',
'translation',
'5GB1_FM34_T0_TR1_QC',
'5GB1_FM34_T3_TR3_QC',
'5GB1_FM34_T4_TR3_QC',
'5GB1_FM34_T5_TR2_QC',
'5GB1_FM34_T6_TR3_QC',
'5GB1_FM34_T7_TR3_QC',
'5GB1_FM34_T8_TR1_QC',
'5GB1_FM40_T0_TR1_QC',
'5GB1_FM40_T0m_TR2',
'5GB1_FM40_T10m_TR3',
'5GB1_FM40_T10m_TR3_QC',
'5GB1_FM40_T20m_TR2',
'5GB1_FM40_T40m_TR1',
'5GB1_FM40_T40m_TR1_QC',
'5GB1_FM40_T60m_TR1',
'5GB1_FM40_T60m_TR1_QC',
'5GB1_FM40_T90m_TR2',
'5GB1_FM40_T90m_TR2_QC',
'5GB1_FM40_T150m_TR1_QC',
'5GB1_FM40_T150m_TR1_remake',
'5GB1_FM40_T180m_TR1',
'5GB1_FM40_T180m_TR1_QC']
In [7]:
# Keep only the columns whose name does not contain "QC".
# DataFrame.select() was deprecated in pandas 0.21 and removed in 1.0, so the filter is
# expressed as a boolean mask over the column labels instead (plain substring match).
# .copy() makes data_3 an independent frame, so adding columns to it later cannot
# trigger SettingWithCopyWarning or touch data_2.
data_3 = data_2.loc[:, ~data_2.columns.str.contains("QC", regex=False)].copy()
In [8]:
# Show the QC-filtered table: 9 annotation columns followed by the 8 remaining FM40 sample columns.
# NOTE(review): a bare frame renders a long table; data_3.head() would keep the output short.
data_3
Out[8]:
locus_tag
product
type
gene_symbol
locus
start_coord
end_coord
note
translation
5GB1_FM40_T0m_TR2
5GB1_FM40_T10m_TR3
5GB1_FM40_T20m_TR2
5GB1_FM40_T40m_TR1
5GB1_FM40_T60m_TR1
5GB1_FM40_T90m_TR2
5GB1_FM40_T150m_TR1_remake
5GB1_FM40_T180m_TR1
0
MBURv2_100001
conserved protein of unknown function
CDS
NaN
MBURv2
1965161
1965952
Evidence 4 : Homologs of previously reported g...
NaN
648
254
301
248
513
294
852
322
1
MBURv2_100002
conserved protein of unknown function
CDS
NaN
MBURv2
1966190
1966369
Evidence 4 : Homologs of previously reported g...
NaN
45
20
13
15
21
18
40
16
2
MBURv2_100003
protein of unknown function
CDS
NaN
MBURv2
1966931
1967041
Evidence 5 : No homology to any previously rep...
NaN
20
7
18
4
12
6
30
2
3
MBURv2_10001
protein of unknown function
CDS
NaN
MBURv2
116
289
Evidence 5 : No homology to any previously rep...
NaN
88
63
61
45
67
38
87
25
4
MBURv2_10002
KfrB
CDS
kfrB
MBURv2
497
844
NaN
NaN
1061
504
537
526
780
667
1497
707
5
MBURv2_10003
Protein traN
CDS
NaN
MBURv2
875
1594
NaN
NaN
2771
1053
1385
949
2194
1331
3626
1315
6
MBURv2_10004
Protein TraM
CDS
traM
MBURv2
1631
2071
NaN
NaN
385
178
213
175
333
220
516
185
7
MBURv2_10005
Protein TraL
CDS
traL
MBURv2
2071
2796
NaN
NaN
551
252
278
207
451
310
743
332
8
MBURv2_10006
Protein TraK
CDS
traK
MBURv2
2796
3176
NaN
NaN
472
164
166
255
268
276
621
304
9
MBURv2_10007
Protein TraJ
CDS
traJ
MBURv2
3508
3876
NaN
NaN
253
90
81
136
124
144
307
162
10
MBURv2_10008
Protein TraI
CDS
traI
MBURv2
3911
6112
NaN
NaN
868
335
376
417
575
464
1043
430
11
MBURv2_10009
Conjugal transfer protein TraG
CDS
traG
MBURv2
6117
8018
NaN
NaN
960
425
433
434
770
581
1284
538
12
MBURv2_10010
Plasmid transfer protein TraF
CDS
traF
MBURv2
8015
8548
NaN
NaN
174
58
95
56
146
79
173
69
13
MBURv2_10011
DNA topoisomerase 3
CDS
topB
MBURv2
8558
10627
NaN
NaN
315
146
194
139
244
190
424
149
14
MBURv2_10012
TraD protein
CDS
NaN
MBURv2
10635
10853
NaN
NaN
26
20
17
13
27
17
64
12
15
MBURv2_10013
DNA primase traC (Modular protein)
CDS
NaN
MBURv2
10856
18922
NaN
NaN
1378
700
719
615
1017
779
2007
766
16
MBURv2_10014
conserved protein of unknown function
CDS
NaN
MBURv2
19300
19908
Evidence 4 : Homologs of previously reported g...
NaN
424
192
241
165
304
190
449
185
17
MBURv2_10015
Glutathione-dependent formaldehyde-activating,...
CDS
NaN
MBURv2
19973
20371
NaN
NaN
358
131
161
117
234
141
315
110
18
MBURv2_10016
putative antioxidant peroxiredoxin-related pro...
CDS
NaN
MBURv2
20447
21028
Evidence 3 : Function proposed based on presen...
NaN
459
172
222
152
292
196
469
214
19
MBURv2_10017
conserved protein of unknown function
CDS
NaN
MBURv2
21067
21537
Evidence 4 : Homologs of previously reported g...
NaN
784
284
327
248
518
290
561
294
20
MBURv2_10018
Pyridoxamine 5'-phosphate oxidase-related FMN-...
CDS
NaN
MBURv2
21622
22173
NaN
NaN
446
229
299
264
431
355
557
329
21
MBURv2_10019
protein of unknown function
CDS
NaN
MBURv2
22227
22721
Evidence 5 : No homology to any previously rep...
NaN
312
139
184
110
254
164
370
124
22
MBURv2_10020
Methyltransferase type 11
CDS
NaN
MBURv2
22801
23409
NaN
NaN
498
238
280
208
418
368
510
290
23
MBURv2_10021
conserved membrane protein of unknown function
CDS
NaN
MBURv2
23485
24141
Evidence 4 : Homologs of previously reported g...
NaN
664
237
362
273
597
441
734
489
24
MBURv2_10022
Methyltransferase domain family
CDS
NaN
MBURv2
24146
24862
NaN
NaN
2348
951
1151
1152
1784
1744
2964
1590
25
MBURv2_10023
Plasmid stabilization system
CDS
NaN
MBURv2
25669
25962
NaN
NaN
528
233
220
263
409
314
692
258
26
MBURv2_10024
conserved protein of unknown function
CDS
NaN
MBURv2
25950
26222
Evidence 4 : Homologs of previously reported g...
NaN
844
359
468
369
566
491
1115
474
27
MBURv2_10025
Phage integrase
CDS
NaN
MBURv2
26419
27387
NaN
NaN
702
286
372
228
542
345
932
272
28
MBURv2_10026
protein of unknown function
CDS
NaN
MBURv2
27445
27723
Evidence 5 : No homology to any previously rep...
NaN
318
138
167
127
222
151
436
147
29
MBURv2_10027
O-methyltransferase family 2
CDS
NaN
MBURv2
27772
28890
NaN
NaN
696
323
368
253
542
321
868
296
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
4563
MBURv2_tRNA22
Ile tRNA
tRNA
NaN
MBURv2
3349301
3349377
NaN
NaN
0
0
0
0
0
0
0
0
4564
MBURv2_tRNA23
Ala tRNA
tRNA
NaN
MBURv2
3349154
3349229
NaN
NaN
0
0
0
0
0
0
0
0
4565
MBURv2_tRNA24
Arg tRNA
tRNA
NaN
MBURv2
3031974
3032050
NaN
NaN
137
97
74
85
147
133
289
109
4566
MBURv2_tRNA25
Ile tRNA
tRNA
NaN
MBURv2
2890542
2890618
NaN
NaN
0
0
0
0
0
0
0
0
4567
MBURv2_tRNA26
Ala tRNA
tRNA
NaN
MBURv2
2890395
2890470
NaN
NaN
0
0
0
0
0
0
0
0
4568
MBURv2_tRNA27
Gln tRNA
tRNA
NaN
MBURv2
2672017
2672091
NaN
NaN
3374
1413
1645
1787
4081
3149
7394
3245
4569
MBURv2_tRNA28
Tyr tRNA
tRNA
NaN
MBURv2
2670122
2670206
NaN
NaN
5354
3280
2796
3274
5969
5492
9447
5636
4570
MBURv2_tRNA29
Gly tRNA
tRNA
NaN
MBURv2
2670028
2670101
NaN
NaN
6697
2987
3812
2966
7731
5205
7122
5163
4571
MBURv2_tRNA3
Asp tRNA
tRNA
NaN
MBURv2
113588
113664
NaN
NaN
1384
1096
885
985
1193
1269
4745
1300
4572
MBURv2_tRNA30
Thr tRNA
tRNA
NaN
MBURv2
2669935
2670010
NaN
NaN
8283
3676
4629
3472
9121
6140
12777
6316
4573
MBURv2_tRNA31
Trp tRNA
tRNA
NaN
MBURv2
2668594
2668669
NaN
NaN
3820
1698
1908
1729
4660
2576
5507
3018
4574
MBURv2_tRNA32
Met tRNA
tRNA
NaN
MBURv2
2370229
2370305
NaN
NaN
483
181
254
203
530
304
844
322
4575
MBURv2_tRNA33
Ala tRNA
tRNA
NaN
MBURv2
2259751
2259826
NaN
NaN
3446
3658
2378
3652
4588
7806
8736
6036
4576
MBURv2_tRNA34
Glu tRNA
tRNA
NaN
MBURv2
2259657
2259732
NaN
NaN
2294
2463
1909
2374
3196
4997
6801
4018
4577
MBURv2_tRNA35
Glu tRNA
tRNA
NaN
MBURv2
2259528
2259603
NaN
NaN
1006
2283
988
2287
1937
4563
5875
3394
4578
MBURv2_tRNA36
Val tRNA
tRNA
NaN
MBURv2
1849580
1849656
NaN
NaN
1146
625
615
621
1312
1415
2075
1173
4579
MBURv2_tRNA37
Pro tRNA
tRNA
NaN
MBURv2
1841903
1841979
NaN
NaN
891
533
520
512
787
616
1344
587
4580
MBURv2_tRNA38
Ser tRNA
tRNA
NaN
MBURv2
1721090
1721181
NaN
NaN
3153
1162
1735
1299
3198
1699
5480
1667
4581
MBURv2_tRNA39
Arg tRNA
tRNA
NaN
MBURv2
1720933
1721009
NaN
NaN
1505
966
927
1162
1589
1737
5039
1892
4582
MBURv2_tRNA4
Asp tRNA
tRNA
NaN
MBURv2
364245
364321
NaN
NaN
112
27
57
30
116
42
127
32
4583
MBURv2_tRNA40
Asn tRNA
tRNA
NaN
MBURv2
1720839
1720914
NaN
NaN
248
300
172
391
269
582
1314
750
4584
MBURv2_tRNA41
Leu tRNA
tRNA
NaN
MBURv2
1364317
1364401
NaN
NaN
1028
701
544
695
1070
998
2235
970
4585
MBURv2_tRNA42
Gly tRNA
tRNA
NaN
MBURv2
1295010
1295084
NaN
NaN
304
217
219
253
374
353
1194
424
4586
MBURv2_tRNA43
Cys tRNA
tRNA
NaN
MBURv2
1294889
1294962
NaN
NaN
71
56
69
42
74
95
242
135
4587
MBURv2_tRNA44
Leu tRNA
tRNA
NaN
MBURv2
688937
689023
NaN
NaN
76
40
43
47
91
94
186
90
4588
MBURv2_tRNA5
Met tRNA
tRNA
NaN
MBURv2
1082864
1082940
NaN
NaN
600
480
342
466
761
824
1726
761
4589
MBURv2_tRNA6
Ser tRNA
tRNA
NaN
MBURv2
1324017
1324107
NaN
NaN
633
517
402
496
784
1018
1155
898
4590
MBURv2_tRNA7
Leu tRNA
tRNA
NaN
MBURv2
2096904
2096990
NaN
NaN
692
344
348
304
781
578
1328
556
4591
MBURv2_tRNA8
Ser tRNA
tRNA
NaN
MBURv2
2881884
2881971
NaN
NaN
202
481
116
555
193
1018
1424
787
4592
MBURv2_tRNA9
Thr tRNA
tRNA
NaN
MBURv2
3193201
3193276
NaN
NaN
0
0
0
0
0
0
0
0
4593 rows × 17 columns
In [9]:
# First step towards TPM for every sample column: gene length in kilobase pairs.
# Length is end_coord - start_coord + 1 (the +1 counts both end positions), divided by 1000.
# assign() returns a new frame rather than writing into data_3 in place, so this cell
# cannot raise SettingWithCopyWarning and gives the same result when re-run.
data_3 = data_3.assign(
    gene_length=(data_3["end_coord"] - data_3["start_coord"] + 1) / 1000
)
In [10]:
# Show the table again, now with gene_length appended as the last (18th) column.
# NOTE(review): a bare frame renders a long table; data_3.head() would keep the output short.
data_3
Out[10]:
locus_tag
product
type
gene_symbol
locus
start_coord
end_coord
note
translation
5GB1_FM40_T0m_TR2
5GB1_FM40_T10m_TR3
5GB1_FM40_T20m_TR2
5GB1_FM40_T40m_TR1
5GB1_FM40_T60m_TR1
5GB1_FM40_T90m_TR2
5GB1_FM40_T150m_TR1_remake
5GB1_FM40_T180m_TR1
gene_length
0
MBURv2_100001
conserved protein of unknown function
CDS
NaN
MBURv2
1965161
1965952
Evidence 4 : Homologs of previously reported g...
NaN
648
254
301
248
513
294
852
322
0.792
1
MBURv2_100002
conserved protein of unknown function
CDS
NaN
MBURv2
1966190
1966369
Evidence 4 : Homologs of previously reported g...
NaN
45
20
13
15
21
18
40
16
0.180
2
MBURv2_100003
protein of unknown function
CDS
NaN
MBURv2
1966931
1967041
Evidence 5 : No homology to any previously rep...
NaN
20
7
18
4
12
6
30
2
0.111
3
MBURv2_10001
protein of unknown function
CDS
NaN
MBURv2
116
289
Evidence 5 : No homology to any previously rep...
NaN
88
63
61
45
67
38
87
25
0.174
4
MBURv2_10002
KfrB
CDS
kfrB
MBURv2
497
844
NaN
NaN
1061
504
537
526
780
667
1497
707
0.348
5
MBURv2_10003
Protein traN
CDS
NaN
MBURv2
875
1594
NaN
NaN
2771
1053
1385
949
2194
1331
3626
1315
0.720
6
MBURv2_10004
Protein TraM
CDS
traM
MBURv2
1631
2071
NaN
NaN
385
178
213
175
333
220
516
185
0.441
7
MBURv2_10005
Protein TraL
CDS
traL
MBURv2
2071
2796
NaN
NaN
551
252
278
207
451
310
743
332
0.726
8
MBURv2_10006
Protein TraK
CDS
traK
MBURv2
2796
3176
NaN
NaN
472
164
166
255
268
276
621
304
0.381
9
MBURv2_10007
Protein TraJ
CDS
traJ
MBURv2
3508
3876
NaN
NaN
253
90
81
136
124
144
307
162
0.369
10
MBURv2_10008
Protein TraI
CDS
traI
MBURv2
3911
6112
NaN
NaN
868
335
376
417
575
464
1043
430
2.202
11
MBURv2_10009
Conjugal transfer protein TraG
CDS
traG
MBURv2
6117
8018
NaN
NaN
960
425
433
434
770
581
1284
538
1.902
12
MBURv2_10010
Plasmid transfer protein TraF
CDS
traF
MBURv2
8015
8548
NaN
NaN
174
58
95
56
146
79
173
69
0.534
13
MBURv2_10011
DNA topoisomerase 3
CDS
topB
MBURv2
8558
10627
NaN
NaN
315
146
194
139
244
190
424
149
2.070
14
MBURv2_10012
TraD protein
CDS
NaN
MBURv2
10635
10853
NaN
NaN
26
20
17
13
27
17
64
12
0.219
15
MBURv2_10013
DNA primase traC (Modular protein)
CDS
NaN
MBURv2
10856
18922
NaN
NaN
1378
700
719
615
1017
779
2007
766
8.067
16
MBURv2_10014
conserved protein of unknown function
CDS
NaN
MBURv2
19300
19908
Evidence 4 : Homologs of previously reported g...
NaN
424
192
241
165
304
190
449
185
0.609
17
MBURv2_10015
Glutathione-dependent formaldehyde-activating,...
CDS
NaN
MBURv2
19973
20371
NaN
NaN
358
131
161
117
234
141
315
110
0.399
18
MBURv2_10016
putative antioxidant peroxiredoxin-related pro...
CDS
NaN
MBURv2
20447
21028
Evidence 3 : Function proposed based on presen...
NaN
459
172
222
152
292
196
469
214
0.582
19
MBURv2_10017
conserved protein of unknown function
CDS
NaN
MBURv2
21067
21537
Evidence 4 : Homologs of previously reported g...
NaN
784
284
327
248
518
290
561
294
0.471
20
MBURv2_10018
Pyridoxamine 5'-phosphate oxidase-related FMN-...
CDS
NaN
MBURv2
21622
22173
NaN
NaN
446
229
299
264
431
355
557
329
0.552
21
MBURv2_10019
protein of unknown function
CDS
NaN
MBURv2
22227
22721
Evidence 5 : No homology to any previously rep...
NaN
312
139
184
110
254
164
370
124
0.495
22
MBURv2_10020
Methyltransferase type 11
CDS
NaN
MBURv2
22801
23409
NaN
NaN
498
238
280
208
418
368
510
290
0.609
23
MBURv2_10021
conserved membrane protein of unknown function
CDS
NaN
MBURv2
23485
24141
Evidence 4 : Homologs of previously reported g...
NaN
664
237
362
273
597
441
734
489
0.657
24
MBURv2_10022
Methyltransferase domain family
CDS
NaN
MBURv2
24146
24862
NaN
NaN
2348
951
1151
1152
1784
1744
2964
1590
0.717
25
MBURv2_10023
Plasmid stabilization system
CDS
NaN
MBURv2
25669
25962
NaN
NaN
528
233
220
263
409
314
692
258
0.294
26
MBURv2_10024
conserved protein of unknown function
CDS
NaN
MBURv2
25950
26222
Evidence 4 : Homologs of previously reported g...
NaN
844
359
468
369
566
491
1115
474
0.273
27
MBURv2_10025
Phage integrase
CDS
NaN
MBURv2
26419
27387
NaN
NaN
702
286
372
228
542
345
932
272
0.969
28
MBURv2_10026
protein of unknown function
CDS
NaN
MBURv2
27445
27723
Evidence 5 : No homology to any previously rep...
NaN
318
138
167
127
222
151
436
147
0.279
29
MBURv2_10027
O-methyltransferase family 2
CDS
NaN
MBURv2
27772
28890
NaN
NaN
696
323
368
253
542
321
868
296
1.119
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
4563
MBURv2_tRNA22
Ile tRNA
tRNA
NaN
MBURv2
3349301
3349377
NaN
NaN
0
0
0
0
0
0
0
0
0.077
4564
MBURv2_tRNA23
Ala tRNA
tRNA
NaN
MBURv2
3349154
3349229
NaN
NaN
0
0
0
0
0
0
0
0
0.076
4565
MBURv2_tRNA24
Arg tRNA
tRNA
NaN
MBURv2
3031974
3032050
NaN
NaN
137
97
74
85
147
133
289
109
0.077
4566
MBURv2_tRNA25
Ile tRNA
tRNA
NaN
MBURv2
2890542
2890618
NaN
NaN
0
0
0
0
0
0
0
0
0.077
4567
MBURv2_tRNA26
Ala tRNA
tRNA
NaN
MBURv2
2890395
2890470
NaN
NaN
0
0
0
0
0
0
0
0
0.076
4568
MBURv2_tRNA27
Gln tRNA
tRNA
NaN
MBURv2
2672017
2672091
NaN
NaN
3374
1413
1645
1787
4081
3149
7394
3245
0.075
4569
MBURv2_tRNA28
Tyr tRNA
tRNA
NaN
MBURv2
2670122
2670206
NaN
NaN
5354
3280
2796
3274
5969
5492
9447
5636
0.085
4570
MBURv2_tRNA29
Gly tRNA
tRNA
NaN
MBURv2
2670028
2670101
NaN
NaN
6697
2987
3812
2966
7731
5205
7122
5163
0.074
4571
MBURv2_tRNA3
Asp tRNA
tRNA
NaN
MBURv2
113588
113664
NaN
NaN
1384
1096
885
985
1193
1269
4745
1300
0.077
4572
MBURv2_tRNA30
Thr tRNA
tRNA
NaN
MBURv2
2669935
2670010
NaN
NaN
8283
3676
4629
3472
9121
6140
12777
6316
0.076
4573
MBURv2_tRNA31
Trp tRNA
tRNA
NaN
MBURv2
2668594
2668669
NaN
NaN
3820
1698
1908
1729
4660
2576
5507
3018
0.076
4574
MBURv2_tRNA32
Met tRNA
tRNA
NaN
MBURv2
2370229
2370305
NaN
NaN
483
181
254
203
530
304
844
322
0.077
4575
MBURv2_tRNA33
Ala tRNA
tRNA
NaN
MBURv2
2259751
2259826
NaN
NaN
3446
3658
2378
3652
4588
7806
8736
6036
0.076
4576
MBURv2_tRNA34
Glu tRNA
tRNA
NaN
MBURv2
2259657
2259732
NaN
NaN
2294
2463
1909
2374
3196
4997
6801
4018
0.076
4577
MBURv2_tRNA35
Glu tRNA
tRNA
NaN
MBURv2
2259528
2259603
NaN
NaN
1006
2283
988
2287
1937
4563
5875
3394
0.076
4578
MBURv2_tRNA36
Val tRNA
tRNA
NaN
MBURv2
1849580
1849656
NaN
NaN
1146
625
615
621
1312
1415
2075
1173
0.077
4579
MBURv2_tRNA37
Pro tRNA
tRNA
NaN
MBURv2
1841903
1841979
NaN
NaN
891
533
520
512
787
616
1344
587
0.077
4580
MBURv2_tRNA38
Ser tRNA
tRNA
NaN
MBURv2
1721090
1721181
NaN
NaN
3153
1162
1735
1299
3198
1699
5480
1667
0.092
4581
MBURv2_tRNA39
Arg tRNA
tRNA
NaN
MBURv2
1720933
1721009
NaN
NaN
1505
966
927
1162
1589
1737
5039
1892
0.077
4582
MBURv2_tRNA4
Asp tRNA
tRNA
NaN
MBURv2
364245
364321
NaN
NaN
112
27
57
30
116
42
127
32
0.077
4583
MBURv2_tRNA40
Asn tRNA
tRNA
NaN
MBURv2
1720839
1720914
NaN
NaN
248
300
172
391
269
582
1314
750
0.076
4584
MBURv2_tRNA41
Leu tRNA
tRNA
NaN
MBURv2
1364317
1364401
NaN
NaN
1028
701
544
695
1070
998
2235
970
0.085
4585
MBURv2_tRNA42
Gly tRNA
tRNA
NaN
MBURv2
1295010
1295084
NaN
NaN
304
217
219
253
374
353
1194
424
0.075
4586
MBURv2_tRNA43
Cys tRNA
tRNA
NaN
MBURv2
1294889
1294962
NaN
NaN
71
56
69
42
74
95
242
135
0.074
4587
MBURv2_tRNA44
Leu tRNA
tRNA
NaN
MBURv2
688937
689023
NaN
NaN
76
40
43
47
91
94
186
90
0.087
4588
MBURv2_tRNA5
Met tRNA
tRNA
NaN
MBURv2
1082864
1082940
NaN
NaN
600
480
342
466
761
824
1726
761
0.077
4589
MBURv2_tRNA6
Ser tRNA
tRNA
NaN
MBURv2
1324017
1324107
NaN
NaN
633
517
402
496
784
1018
1155
898
0.091
4590
MBURv2_tRNA7
Leu tRNA
tRNA
NaN
MBURv2
2096904
2096990
NaN
NaN
692
344
348
304
781
578
1328
556
0.087
4591
MBURv2_tRNA8
Ser tRNA
tRNA
NaN
MBURv2
2881884
2881971
NaN
NaN
202
481
116
555
193
1018
1424
787
0.088
4592
MBURv2_tRNA9
Thr tRNA
tRNA
NaN
MBURv2
3193201
3193276
NaN
NaN
0
0
0
0
0
0
0
0
0.076
4593 rows × 18 columns
In [11]:
# Sanity check before continuing: summary statistics (count, mean, std, min,
# quartiles, max) of the gene_length column.
data_3["gene_length"].describe()
Out[11]:
count 4593.000000
mean 0.920235
std 0.777701
min 0.063000
25% 0.390000
50% 0.738000
75% 1.218000
max 10.320000
Name: gene_length, dtype: float64
In [12]:
# Look up the positional index of the divisor column (gene_length) and of the first
# FM40 sample column. Every FM40 column gets divided by gene_length, so the
# operation is columns [9-16] / column [17].
for label in ("gene_length", "5GB1_FM40_T0m_TR2"):
    print(data_3.columns.get_loc(label))
17
9
In [13]:
# Divide each FM40 count column row-wise by the gene length in kb.
# The slice 9:17 covers positions 9-16 because the stop value is exclusive.
RPK = data_3.iloc[:, 9:17].divide(data_3["gene_length"], axis="index")
In [14]:
# Put the first nine columns back in front of the length-normalized counts.
data_4 = pd.concat(objs=(first_8, RPK), axis=1)
In [15]:
# Show the table whose FM40 columns now hold counts divided by gene length (in kb).
# NOTE(review): a bare frame renders a long table; data_4.head() would keep the output short.
data_4
Out[15]:
locus_tag
product
type
gene_symbol
locus
start_coord
end_coord
note
translation
5GB1_FM40_T0m_TR2
5GB1_FM40_T10m_TR3
5GB1_FM40_T20m_TR2
5GB1_FM40_T40m_TR1
5GB1_FM40_T60m_TR1
5GB1_FM40_T90m_TR2
5GB1_FM40_T150m_TR1_remake
5GB1_FM40_T180m_TR1
0
MBURv2_100001
conserved protein of unknown function
CDS
NaN
MBURv2
1965161
1965952
Evidence 4 : Homologs of previously reported g...
NaN
818.181818
320.707071
380.050505
313.131313
647.727273
371.212121
1075.757576
406.565657
1
MBURv2_100002
conserved protein of unknown function
CDS
NaN
MBURv2
1966190
1966369
Evidence 4 : Homologs of previously reported g...
NaN
250.000000
111.111111
72.222222
83.333333
116.666667
100.000000
222.222222
88.888889
2
MBURv2_100003
protein of unknown function
CDS
NaN
MBURv2
1966931
1967041
Evidence 5 : No homology to any previously rep...
NaN
180.180180
63.063063
162.162162
36.036036
108.108108
54.054054
270.270270
18.018018
3
MBURv2_10001
protein of unknown function
CDS
NaN
MBURv2
116
289
Evidence 5 : No homology to any previously rep...
NaN
505.747126
362.068966
350.574713
258.620690
385.057471
218.390805
500.000000
143.678161
4
MBURv2_10002
KfrB
CDS
kfrB
MBURv2
497
844
NaN
NaN
3048.850575
1448.275862
1543.103448
1511.494253
2241.379310
1916.666667
4301.724138
2031.609195
5
MBURv2_10003
Protein traN
CDS
NaN
MBURv2
875
1594
NaN
NaN
3848.611111
1462.500000
1923.611111
1318.055556
3047.222222
1848.611111
5036.111111
1826.388889
6
MBURv2_10004
Protein TraM
CDS
traM
MBURv2
1631
2071
NaN
NaN
873.015873
403.628118
482.993197
396.825397
755.102041
498.866213
1170.068027
419.501134
7
MBURv2_10005
Protein TraL
CDS
traL
MBURv2
2071
2796
NaN
NaN
758.953168
347.107438
382.920110
285.123967
621.212121
426.997245
1023.415978
457.300275
8
MBURv2_10006
Protein TraK
CDS
traK
MBURv2
2796
3176
NaN
NaN
1238.845144
430.446194
435.695538
669.291339
703.412073
724.409449
1629.921260
797.900262
9
MBURv2_10007
Protein TraJ
CDS
traJ
MBURv2
3508
3876
NaN
NaN
685.636856
243.902439
219.512195
368.563686
336.043360
390.243902
831.978320
439.024390
10
MBURv2_10008
Protein TraI
CDS
traI
MBURv2
3911
6112
NaN
NaN
394.187103
152.134423
170.753860
189.373297
261.126249
210.717530
473.660309
195.277021
11
MBURv2_10009
Conjugal transfer protein TraG
CDS
traG
MBURv2
6117
8018
NaN
NaN
504.731861
223.449001
227.655100
228.180862
404.837014
305.467928
675.078864
282.860147
12
MBURv2_10010
Plasmid transfer protein TraF
CDS
traF
MBURv2
8015
8548
NaN
NaN
325.842697
108.614232
177.902622
104.868914
273.408240
147.940075
323.970037
129.213483
13
MBURv2_10011
DNA topoisomerase 3
CDS
topB
MBURv2
8558
10627
NaN
NaN
152.173913
70.531401
93.719807
67.149758
117.874396
91.787440
204.830918
71.980676
14
MBURv2_10012
TraD protein
CDS
NaN
MBURv2
10635
10853
NaN
NaN
118.721461
91.324201
77.625571
59.360731
123.287671
77.625571
292.237443
54.794521
15
MBURv2_10013
DNA primase traC (Modular protein)
CDS
NaN
MBURv2
10856
18922
NaN
NaN
170.819388
86.773274
89.128548
76.236519
126.069171
96.566258
248.791372
94.954754
16
MBURv2_10014
conserved protein of unknown function
CDS
NaN
MBURv2
19300
19908
Evidence 4 : Homologs of previously reported g...
NaN
696.223317
315.270936
395.730706
270.935961
499.178982
311.986864
737.274220
303.776683
17
MBURv2_10015
Glutathione-dependent formaldehyde-activating,...
CDS
NaN
MBURv2
19973
20371
NaN
NaN
897.243108
328.320802
403.508772
293.233083
586.466165
353.383459
789.473684
275.689223
18
MBURv2_10016
putative antioxidant peroxiredoxin-related pro...
CDS
NaN
MBURv2
20447
21028
Evidence 3 : Function proposed based on presen...
NaN
788.659794
295.532646
381.443299
261.168385
501.718213
336.769759
805.841924
367.697595
19
MBURv2_10017
conserved protein of unknown function
CDS
NaN
MBURv2
21067
21537
Evidence 4 : Homologs of previously reported g...
NaN
1664.543524
602.972399
694.267516
526.539278
1099.787686
615.711253
1191.082803
624.203822
20
MBURv2_10018
Pyridoxamine 5'-phosphate oxidase-related FMN-...
CDS
NaN
MBURv2
21622
22173
NaN
NaN
807.971014
414.855072
541.666667
478.260870
780.797101
643.115942
1009.057971
596.014493
21
MBURv2_10019
protein of unknown function
CDS
NaN
MBURv2
22227
22721
Evidence 5 : No homology to any previously rep...
NaN
630.303030
280.808081
371.717172
222.222222
513.131313
331.313131
747.474747
250.505051
22
MBURv2_10020
Methyltransferase type 11
CDS
NaN
MBURv2
22801
23409
NaN
NaN
817.733990
390.804598
459.770115
341.543514
686.371100
604.269294
837.438424
476.190476
23
MBURv2_10021
conserved membrane protein of unknown function
CDS
NaN
MBURv2
23485
24141
Evidence 4 : Homologs of previously reported g...
NaN
1010.654490
360.730594
550.989346
415.525114
908.675799
671.232877
1117.199391
744.292237
24
MBURv2_10022
Methyltransferase domain family
CDS
NaN
MBURv2
24146
24862
NaN
NaN
3274.755927
1326.359833
1605.299861
1606.694561
2488.145049
2432.357043
4133.891213
2217.573222
25
MBURv2_10023
Plasmid stabilization system
CDS
NaN
MBURv2
25669
25962
NaN
NaN
1795.918367
792.517007
748.299320
894.557823
1391.156463
1068.027211
2353.741497
877.551020
26
MBURv2_10024
conserved protein of unknown function
CDS
NaN
MBURv2
25950
26222
Evidence 4 : Homologs of previously reported g...
NaN
3091.575092
1315.018315
1714.285714
1351.648352
2073.260073
1798.534799
4084.249084
1736.263736
27
MBURv2_10025
Phage integrase
CDS
NaN
MBURv2
26419
27387
NaN
NaN
724.458204
295.149639
383.900929
235.294118
559.339525
356.037152
961.816305
280.701754
28
MBURv2_10026
protein of unknown function
CDS
NaN
MBURv2
27445
27723
Evidence 5 : No homology to any previously rep...
NaN
1139.784946
494.623656
598.566308
455.197133
795.698925
541.218638
1562.724014
526.881720
29
MBURv2_10027
O-methyltransferase family 2
CDS
NaN
MBURv2
27772
28890
NaN
NaN
621.983914
288.650581
328.865058
226.094727
484.361037
286.863271
775.692583
264.521895
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
4563
MBURv2_tRNA22
Ile tRNA
tRNA
NaN
MBURv2
3349301
3349377
NaN
NaN
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
4564
MBURv2_tRNA23
Ala tRNA
tRNA
NaN
MBURv2
3349154
3349229
NaN
NaN
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
4565
MBURv2_tRNA24
Arg tRNA
tRNA
NaN
MBURv2
3031974
3032050
NaN
NaN
1779.220779
1259.740260
961.038961
1103.896104
1909.090909
1727.272727
3753.246753
1415.584416
4566
MBURv2_tRNA25
Ile tRNA
tRNA
NaN
MBURv2
2890542
2890618
NaN
NaN
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
4567
MBURv2_tRNA26
Ala tRNA
tRNA
NaN
MBURv2
2890395
2890470
NaN
NaN
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
4568
MBURv2_tRNA27
Gln tRNA
tRNA
NaN
MBURv2
2672017
2672091
NaN
NaN
44986.666667
18840.000000
21933.333333
23826.666667
54413.333333
41986.666667
98586.666667
43266.666667
4569
MBURv2_tRNA28
Tyr tRNA
tRNA
NaN
MBURv2
2670122
2670206
NaN
NaN
62988.235294
38588.235294
32894.117647
38517.647059
70223.529412
64611.764706
111141.176471
66305.882353
4570
MBURv2_tRNA29
Gly tRNA
tRNA
NaN
MBURv2
2670028
2670101
NaN
NaN
90500.000000
40364.864865
51513.513514
40081.081081
104472.972973
70337.837838
96243.243243
69770.270270
4571
MBURv2_tRNA3
Asp tRNA
tRNA
NaN
MBURv2
113588
113664
NaN
NaN
17974.025974
14233.766234
11493.506494
12792.207792
15493.506494
16480.519481
61623.376623
16883.116883
4572
MBURv2_tRNA30
Thr tRNA
tRNA
NaN
MBURv2
2669935
2670010
NaN
NaN
108986.842105
48368.421053
60907.894737
45684.210526
120013.157895
80789.473684
168118.421053
83105.263158
4573
MBURv2_tRNA31
Trp tRNA
tRNA
NaN
MBURv2
2668594
2668669
NaN
NaN
50263.157895
22342.105263
25105.263158
22750.000000
61315.789474
33894.736842
72460.526316
39710.526316
4574
MBURv2_tRNA32
Met tRNA
tRNA
NaN
MBURv2
2370229
2370305
NaN
NaN
6272.727273
2350.649351
3298.701299
2636.363636
6883.116883
3948.051948
10961.038961
4181.818182
4575
MBURv2_tRNA33
Ala tRNA
tRNA
NaN
MBURv2
2259751
2259826
NaN
NaN
45342.105263
48131.578947
31289.473684
48052.631579
60368.421053
102710.526316
114947.368421
79421.052632
4576
MBURv2_tRNA34
Glu tRNA
tRNA
NaN
MBURv2
2259657
2259732
NaN
NaN
30184.210526
32407.894737
25118.421053
31236.842105
42052.631579
65750.000000
89486.842105
52868.421053
4577
MBURv2_tRNA35
Glu tRNA
tRNA
NaN
MBURv2
2259528
2259603
NaN
NaN
13236.842105
30039.473684
13000.000000
30092.105263
25486.842105
60039.473684
77302.631579
44657.894737
4578
MBURv2_tRNA36
Val tRNA
tRNA
NaN
MBURv2
1849580
1849656
NaN
NaN
14883.116883
8116.883117
7987.012987
8064.935065
17038.961039
18376.623377
26948.051948
15233.766234
4579
MBURv2_tRNA37
Pro tRNA
tRNA
NaN
MBURv2
1841903
1841979
NaN
NaN
11571.428571
6922.077922
6753.246753
6649.350649
10220.779221
8000.000000
17454.545455
7623.376623
4580
MBURv2_tRNA38
Ser tRNA
tRNA
NaN
MBURv2
1721090
1721181
NaN
NaN
34271.739130
12630.434783
18858.695652
14119.565217
34760.869565
18467.391304
59565.217391
18119.565217
4581
MBURv2_tRNA39
Arg tRNA
tRNA
NaN
MBURv2
1720933
1721009
NaN
NaN
19545.454545
12545.454545
12038.961039
15090.909091
20636.363636
22558.441558
65441.558442
24571.428571
4582
MBURv2_tRNA4
Asp tRNA
tRNA
NaN
MBURv2
364245
364321
NaN
NaN
1454.545455
350.649351
740.259740
389.610390
1506.493506
545.454545
1649.350649
415.584416
4583
MBURv2_tRNA40
Asn tRNA
tRNA
NaN
MBURv2
1720839
1720914
NaN
NaN
3263.157895
3947.368421
2263.157895
5144.736842
3539.473684
7657.894737
17289.473684
9868.421053
4584
MBURv2_tRNA41
Leu tRNA
tRNA
NaN
MBURv2
1364317
1364401
NaN
NaN
12094.117647
8247.058824
6400.000000
8176.470588
12588.235294
11741.176471
26294.117647
11411.764706
4585
MBURv2_tRNA42
Gly tRNA
tRNA
NaN
MBURv2
1295010
1295084
NaN
NaN
4053.333333
2893.333333
2920.000000
3373.333333
4986.666667
4706.666667
15920.000000
5653.333333
4586
MBURv2_tRNA43
Cys tRNA
tRNA
NaN
MBURv2
1294889
1294962
NaN
NaN
959.459459
756.756757
932.432432
567.567568
1000.000000
1283.783784
3270.270270
1824.324324
4587
MBURv2_tRNA44
Leu tRNA
tRNA
NaN
MBURv2
688937
689023
NaN
NaN
873.563218
459.770115
494.252874
540.229885
1045.977011
1080.459770
2137.931034
1034.482759
4588
MBURv2_tRNA5
Met tRNA
tRNA
NaN
MBURv2
1082864
1082940
NaN
NaN
7792.207792
6233.766234
4441.558442
6051.948052
9883.116883
10701.298701
22415.584416
9883.116883
4589
MBURv2_tRNA6
Ser tRNA
tRNA
NaN
MBURv2
1324017
1324107
NaN
NaN
6956.043956
5681.318681
4417.582418
5450.549451
8615.384615
11186.813187
12692.307692
9868.131868
4590
MBURv2_tRNA7
Leu tRNA
tRNA
NaN
MBURv2
2096904
2096990
NaN
NaN
7954.022989
3954.022989
4000.000000
3494.252874
8977.011494
6643.678161
15264.367816
6390.804598
4591
MBURv2_tRNA8
Ser tRNA
tRNA
NaN
MBURv2
2881884
2881971
NaN
NaN
2295.454545
5465.909091
1318.181818
6306.818182
2193.181818
11568.181818
16181.818182
8943.181818
4592
MBURv2_tRNA9
Thr tRNA
tRNA
NaN
MBURv2
3193201
3193276
NaN
NaN
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
4593 rows × 17 columns
In [16]:
# Per-sample totals of the length-normalized reads; they differ between samples,
# which is why a further per-sample scaling step is needed.
data_4.iloc[:, 9:17].sum(axis=0)
Out[16]:
5GB1_FM40_T0m_TR2 34037892.982884
5GB1_FM40_T10m_TR3 16540910.063731
5GB1_FM40_T20m_TR2 19517215.620340
5GB1_FM40_T40m_TR1 15187260.790192
5GB1_FM40_T60m_TR1 29189141.471211
5GB1_FM40_T90m_TR2 18386004.046463
5GB1_FM40_T150m_TR1_remake 45212818.536209
5GB1_FM40_T180m_TR1 19180042.089329
dtype: float64
In [17]:
# Per-sample scaling factors: the sum of each FM40 column divided by 1,000,000,
# turned from a Series into a DataFrame and transposed so that the result is a
# single row with one value per sample column.
norm_sum = (
    data_4.iloc[:, 9:17]
    .sum(axis=0)
    .div(1000000)
    .to_frame()
    .T
)
In [18]:
# Show the one-row frame of scaling factors (column sum / 1,000,000), one value per FM40 sample.
norm_sum
Out[18]:
5GB1_FM40_T0m_TR2
5GB1_FM40_T10m_TR3
5GB1_FM40_T20m_TR2
5GB1_FM40_T40m_TR1
5GB1_FM40_T60m_TR1
5GB1_FM40_T90m_TR2
5GB1_FM40_T150m_TR1_remake
5GB1_FM40_T180m_TR1
0
34.037893
16.54091
19.517216
15.187261
29.189141
18.386004
45.212819
19.180042
In [19]:
# TPM: divide every FM40 column by that sample's scaling factor (its column total / 1,000,000),
# so that each column afterwards sums to one million.
# norm_sum has a single row; .iloc[0] extracts it as a Series indexed by sample name, and
# div() aligns that Series with the column labels. The original used the .ix indexer,
# which was deprecated in pandas 0.20 and removed in 1.0.
TPM = data_4.iloc[:, 9:17].div(norm_sum.iloc[0])
In [20]:
# Final table: the first nine columns followed by the TPM values.
data_5 = pd.concat(objs=(first_8, TPM), axis=1)
In [21]:
# Check: every sample column should now have the same total. With that confirmed,
# the samples can be used for statistics.
data_5.iloc[:, 9:17].sum(axis=0)
Out[21]:
5GB1_FM40_T0m_TR2 1000000
5GB1_FM40_T10m_TR3 1000000
5GB1_FM40_T20m_TR2 1000000
5GB1_FM40_T40m_TR1 1000000
5GB1_FM40_T60m_TR1 1000000
5GB1_FM40_T90m_TR2 1000000
5GB1_FM40_T150m_TR1_remake 1000000
5GB1_FM40_T180m_TR1 1000000
dtype: float64
In [22]:
# Show the final table: the first nine columns plus the TPM-normalized FM40 sample columns.
# NOTE(review): a bare frame renders a long table; data_5.head() would keep the output short.
data_5
Out[22]:
locus_tag
product
type
gene_symbol
locus
start_coord
end_coord
note
translation
5GB1_FM40_T0m_TR2
5GB1_FM40_T10m_TR3
5GB1_FM40_T20m_TR2
5GB1_FM40_T40m_TR1
5GB1_FM40_T60m_TR1
5GB1_FM40_T90m_TR2
5GB1_FM40_T150m_TR1_remake
5GB1_FM40_T180m_TR1
0
MBURv2_100001
conserved protein of unknown function
CDS
NaN
MBURv2
1965161
1965952
Evidence 4 : Homologs of previously reported g...
NaN
24.037381
19.388720
19.472578
20.618024
22.190693
20.189929
23.793199
21.197329
1
MBURv2_100002
conserved protein of unknown function
CDS
NaN
MBURv2
1966190
1966369
Evidence 4 : Homologs of previously reported g...
NaN
7.344755
6.717352
3.700437
5.487055
3.996920
5.438920
4.915027
4.634447
2
MBURv2_100003
protein of unknown function
CDS
NaN
MBURv2
1966931
1967041
Evidence 5 : No homology to any previously rep...
NaN
5.293517
3.812551
8.308673
2.372780
3.703710
2.939957
5.977735
0.939415
3
MBURv2_10001
protein of unknown function
CDS
NaN
MBURv2
116
289
Evidence 5 : No homology to any previously rep...
NaN
14.858356
21.889301
17.962332
17.028791
13.191805
11.878101
11.058811
7.491024
4
MBURv2_10002
KfrB
CDS
kfrB
MBURv2
497
844
NaN
NaN
89.572248
87.557206
79.063709
99.523823
76.788120
104.245961
95.143906
105.923083
5
MBURv2_10003
Protein traN
CDS
NaN
MBURv2
875
1594
NaN
NaN
113.068430
88.417142
98.559710
86.786918
104.395747
100.544474
111.386799
95.223404
6
MBURv2_10004
Protein TraM
CDS
traM
MBURv2
1631
2071
NaN
NaN
25.648352
24.401808
24.747034
26.128833
25.869279
27.132933
25.879122
21.871753
7
MBURv2_10005
Protein TraL
CDS
traL
MBURv2
2071
2796
NaN
NaN
22.297302
20.984785
19.619608
18.773890
21.282302
23.224037
22.635527
23.842506
8
MBURv2_10006
Protein TraK
CDS
traK
MBURv2
2796
3176
NaN
NaN
36.396059
26.023126
22.323652
44.069260
24.098416
39.400048
36.049981
41.600548
9
MBURv2_10007
Protein TraJ
CDS
traJ
MBURv2
3508
3876
NaN
NaN
20.143340
14.745406
11.247106
24.267950
11.512615
21.225053
18.401381
22.889647
10
MBURv2_10008
Protein TraI
CDS
traI
MBURv2
3911
6112
NaN
NaN
11.580831
9.197464
8.748884
12.469220
8.946006
11.460757
10.476239
10.181261
11
MBURv2_10009
Conjugal transfer protein TraG
CDS
traG
MBURv2
6117
8018
NaN
NaN
14.828528
13.508870
11.664323
15.024491
13.869439
16.614155
14.931139
14.747629
12
MBURv2_10010
Plasmid transfer protein TraF
CDS
traF
MBURv2
8015
8548
NaN
NaN
9.572940
6.566400
9.115164
6.905058
9.366779
8.046342
7.165447
6.736872
13
MBURv2_10011
DNA topoisomerase 3
CDS
topB
MBURv2
8558
10627
NaN
NaN
4.470721
4.264058
4.801905
4.421453
4.038296
4.992245
4.530373
3.752895
14
MBURv2_10012
TraD protein
CDS
NaN
MBURv2
10635
10853
NaN
NaN
3.487920
5.521111
3.977287
3.908587
4.223751
4.221992
6.463597
2.856851
15
MBURv2_10013
DNA primase traC (Modular protein)
CDS
NaN
MBURv2
10856
18922
NaN
NaN
5.018507
5.245979
4.566663
5.019768
4.319043
5.252161
5.502673
4.950706
16
MBURv2_10014
conserved protein of unknown function
CDS
NaN
MBURv2
19300
19908
Evidence 4 : Homologs of previously reported g...
NaN
20.454360
19.060072
20.275982
17.839686
17.101530
16.968715
16.306752
15.838166
17
MBURv2_10015
Glutathione-dependent formaldehyde-activating,...
CDS
NaN
MBURv2
19973
20371
NaN
NaN
26.360125
19.849017
20.674505
19.307832
20.091929
19.220243
17.461280
14.373755
18
MBURv2_10016
putative antioxidant peroxiredoxin-related pro...
CDS
NaN
MBURv2
20447
21028
Evidence 3 : Function proposed based on presen...
NaN
23.170053
17.866771
19.543940
17.196543
17.188522
18.316637
17.823307
19.170844
19
MBURv2_10017
conserved protein of unknown function
CDS
NaN
MBURv2
21067
21537
Evidence 4 : Homologs of previously reported g...
NaN
48.902660
36.453399
35.572057
34.669799
37.677973
33.488041
26.343918
32.544445
20
MBURv2_10018
Pyridoxamine 5'-phosphate oxidase-related FMN-...
CDS
NaN
MBURv2
21622
22173
NaN
NaN
23.737398
25.080547
27.753276
31.490924
26.749574
34.978560
22.317962
31.074723
21
MBURv2_10019
protein of unknown function
CDS
NaN
MBURv2
22227
22721
Evidence 5 : No homology to any previously rep...
NaN
18.517686
16.976580
19.045605
14.632146
17.579527
18.019855
16.532363
13.060714
22
MBURv2_10020
Methyltransferase type 11
CDS
NaN
MBURv2
22801
23409
NaN
NaN
24.024225
23.626548
23.557157
22.488816
23.514604
32.865722
18.522146
24.827395
23
MBURv2_10021
conserved membrane protein of unknown function
CDS
NaN
MBURv2
23485
24141
Evidence 4 : Homologs of previously reported g...
NaN
29.692040
21.808389
28.230940
27.360109
31.130611
36.507817
24.709793
38.805558
24
MBURv2_10022
Methyltransferase domain family
CDS
NaN
MBURv2
24146
24862
NaN
NaN
96.209126
80.186630
82.250455
105.792255
85.242146
132.293947
91.431841
115.618788
25
MBURv2_10023
Plasmid stabilization system
CDS
NaN
MBURv2
25669
25962
NaN
NaN
52.762325
47.912540
38.340475
58.901854
47.660068
58.089143
52.059163
45.753342
26
MBURv2_10024
conserved protein of unknown function
CDS
NaN
MBURv2
25950
26222
Evidence 4 : Homologs of previously reported g...
NaN
90.827452
79.500965
87.834543
88.998824
71.028470
97.820864
90.333875
90.524501
27
MBURv2_10025
Phage integrase
CDS
NaN
MBURv2
26419
27387
NaN
NaN
21.283873
17.843615
19.669862
15.492861
19.162589
19.364575
21.273089
14.635096
28
MBURv2_10026
protein of unknown function
CDS
NaN
MBURv2
27445
27723
Evidence 5 : No homology to any previously rep...
NaN
33.485767
29.903050
30.668632
29.972300
27.260100
29.436447
34.563738
27.470311
29
MBURv2_10027
O-methyltransferase family 2
CDS
NaN
MBURv2
27772
28890
NaN
NaN
18.273279
17.450707
16.849999
14.887130
16.593877
15.602263
17.156475
13.791518
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
4563
MBURv2_tRNA22
Ile tRNA
tRNA
NaN
MBURv2
3349301
3349377
NaN
NaN
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
4564
MBURv2_tRNA23
Ala tRNA
tRNA
NaN
MBURv2
3349154
3349229
NaN
NaN
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
4565
MBURv2_tRNA24
Arg tRNA
tRNA
NaN
MBURv2
3031974
3032050
NaN
NaN
52.271766
76.159066
49.240577
72.685662
65.404147
93.944977
83.012890
73.805073
4566
MBURv2_tRNA25
Ile tRNA
tRNA
NaN
MBURv2
2890542
2890618
NaN
NaN
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
4567
MBURv2_tRNA26
Ala tRNA
tRNA
NaN
MBURv2
2890395
2890470
NaN
NaN
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
4568
MBURv2_tRNA27
Gln tRNA
tRNA
NaN
MBURv2
2672017
2672091
NaN
NaN
1321.664261
1138.994162
1123.794180
1568.858729
1864.163541
2283.621094
2180.502562
2255.817087
4569
MBURv2_tRNA28
Tyr tRNA
tRNA
NaN
MBURv2
2670122
2670206
NaN
NaN
1850.532738
2332.896748
1685.389878
2536.181316
2405.810033
3514.182013
2458.178456
3457.024862
4570
MBURv2_tRNA29
Gly tRNA
tRNA
NaN
MBURv2
2670028
2670101
NaN
NaN
2658.801473
2440.304960
2639.388451
2639.125095
3579.172518
3825.618534
2128.671610
3637.649487
4571
MBURv2_tRNA3
Asp tRNA
tRNA
NaN
MBURv2
113588
113664
NaN
NaN
528.059301
860.518930
588.890686
842.298553
530.796924
896.362224
1362.962510
880.243996
4572
MBURv2_tRNA30
Thr tRNA
tRNA
NaN
MBURv2
2669935
2670010
NaN
NaN
3201.926810
2924.169279
3120.726641
3008.061240
4111.568612
4394.074617
3718.379577
4332.903065
4573
MBURv2_tRNA31
Trp tRNA
tRNA
NaN
MBURv2
2668594
2668669
NaN
NaN
1476.682412
1350.718018
1286.313768
1497.965980
2100.636962
1843.507527
1602.654483
2070.408716
4574
MBURv2_tRNA32
Met tRNA
tRNA
NaN
MBURv2
2370229
2370305
NaN
NaN
184.286591
142.111247
169.014954
173.590463
235.810871
214.731376
242.432109
218.029667
4575
MBURv2_tRNA33
Ala tRNA
tRNA
NaN
MBURv2
2259751
2259826
NaN
NaN
1332.106699
2909.850713
1603.173029
3164.009115
2068.180769
5586.343071
2542.362369
4140.817432
4576
MBURv2_tRNA34
Glu tRNA
tRNA
NaN
MBURv2
2259657
2259732
NaN
NaN
886.782579
1959.257055
1286.987936
2056.779200
1440.694363
3576.089717
1979.236088
2756.428834
4577
MBURv2_tRNA35
Glu tRNA
tRNA
NaN
MBURv2
2259528
2259603
NaN
NaN
388.885473
1816.071399
666.078618
1981.404394
873.161759
3265.498775
1709.750334
2328.352280
4578
MBURv2_tRNA36
Val tRNA
tRNA
NaN
MBURv2
1849580
1849656
NaN
NaN
437.251415
490.715631
409.229121
531.032895
583.743138
999.489793
596.026809
794.250928
4579
MBURv2_tRNA37
Pro tRNA
tRNA
NaN
MBURv2
1841903
1841979
NaN
NaN
339.957252
418.482290
346.014866
437.824223
350.156898
435.113578
386.053027
397.464020
4580
MBURv2_tRNA38
Ser tRNA
tRNA
NaN
MBURv2
1721090
1721181
NaN
NaN
1006.870171
763.587658
966.259533
929.697950
1190.883589
1004.426588
1317.440923
944.709356
4581
MBURv2_tRNA39
Arg tRNA
tRNA
NaN
MBURv2
1720933
1721009
NaN
NaN
574.226335
758.450079
616.838040
993.655755
706.987688
1226.935527
1447.411609
1281.093569
4582
MBURv2_tRNA4
Asp tRNA
tRNA
NaN
MBURv2
364245
364321
NaN
NaN
42.733123
21.198915
37.928553
25.653763
51.611436
29.666835
36.479713
21.667545
4583
MBURv2_tRNA40
Asn tRNA
tRNA
NaN
MBURv2
1720839
1720914
NaN
NaN
95.868387
238.642759
115.957006
338.753440
121.259945
416.506747
382.402032
514.515088
4584
MBURv2_tRNA41
Leu tRNA
tRNA
NaN
MBURv2
1364317
1364401
NaN
NaN
355.313346
498.585555
327.915627
538.376913
431.264322
638.593163
581.563338
594.981213
4585
MBURv2_tRNA42
Gly tRNA
tRNA
NaN
MBURv2
1295010
1295084
NaN
NaN
119.082968
174.919839
149.611505
222.115981
170.839785
255.991822
352.112532
294.750830
4586
MBURv2_tRNA43
Cys tRNA
tRNA
NaN
MBURv2
1294889
1294962
NaN
NaN
28.187980
45.750612
47.774870
37.371293
34.259315
69.823969
72.330600
95.115762
4587
MBURv2_tRNA44
Leu tRNA
tRNA
NaN
MBURv2
688937
689023
NaN
NaN
25.664433
27.795938
25.323944
35.571252
35.834456
58.765340
47.285949
53.935375
4588
MBURv2_tRNA5
Met tRNA
tRNA
NaN
MBURv2
1082864
1082940
NaN
NaN
228.927443
376.869604
227.571316
398.488453
338.588817
582.035045
495.779408
515.281293
4589
MBURv2_tRNA6
Ser tRNA
tRNA
NaN
MBURv2
1324017
1324107
NaN
NaN
204.361767
343.470744
226.342861
358.889567
295.157178
608.441789
280.723655
514.500011
4590
MBURv2_tRNA7
Leu tRNA
tRNA
NaN
MBURv2
2096904
2096990
NaN
NaN
233.681415
239.045069
204.947267
230.077887
307.546267
361.344322
337.611507
333.200760
4591
MBURv2_tRNA8
Ser tRNA
tRNA
NaN
MBURv2
2881884
2881971
NaN
NaN
67.438209
330.447906
67.539440
415.270289
75.136907
629.184122
357.903327
466.275401
4592
MBURv2_tRNA9
Thr tRNA
tRNA
NaN
MBURv2
3193201
3193276
NaN
NaN
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
4593 rows × 17 columns
In [ ]:
# Picking a single column by its label gives back a one-dimensional pandas Series.
df = data.loc[:, "locus_tag"]
df
In [ ]:
# Several columns at once: hand the indexer a list of column labels.
df1 = data.loc[:, ["locus_tag", "type"]]
df1
# Typing out every column name by hand is not practical when there are many columns.
In [ ]:
# All column labels of the raw table as a plain Python list.
data.columns.tolist()
In [ ]:
# Print the integer position of each landmark column so the ranges can be sliced by position.
for label in ("translation", "5GB1_FM40_T0m_TR2", "5GB1_FM40_T90m_TR2_QC"):
    print(data.columns.get_loc(label))
In [ ]:
# Column positions to keep: 0-8 followed by 20-41 (the end of each range is exclusive).
columns_list = [*range(9), *range(20, 42)]
In [ ]:
# Display the selected column positions to check them.
columns_list
In [ ]:
data_1 = data.iloc[:, columns_list] # position-based selection: every row, only the column positions listed in columns_list
In [ ]:
# Split the frame by position: the nine leading columns, and every column after them (still to be sorted).
first_8, remaining_data = data_1.iloc[:, :9], data_1.iloc[:, 9:]
remaining_data
In [ ]:
sorted(remaining_data.columns) # built-in sorted() returns a new list; strings are compared character by character, so the order is not numeric-aware
In [ ]:
# Sorting on the string form of each label is still character-by-character, so "150" lands before "40".
sorted(remaining_data.columns, key=str)
In [ ]:
import natsort as ns  # third-party package providing natural (numeric-aware) sorting
# natsorted() already hands back a list, and the numbers inside the labels are ordered by value.
ns.natsorted(remaining_data.columns)
In [ ]:
# Keep the naturally ordered column labels for reuse.
cols = ns.natsorted(remaining_data.columns)
In [ ]:
# Preview the first rows with the columns laid out in natural order.
remaining_data.loc[:, cols].head()
In [ ]:
# Selecting by the ordered label list reorders the columns, values included.
newdf = remaining_data.loc[:, cols]
In [ ]:
# Preview the first rows of the column-reordered frame.
newdf.head()
In [ ]:
ns.natsorted(remaining_data)# iterating a DataFrame yields its column labels, so this returns a plain
# list of the naturally sorted column names, not a reordered DataFrame; the list still has to be used to select columns.
In [ ]:
# `.loc` is an indexer and is used with square brackets; calling it like a function treats the
# argument as an axis name and raises. Keep every column whose name contains the text instead
# (the result is simply an empty frame when nothing matches).
# NOTE(review): the intended lookup is a guess - confirm which column(s) were meant.
remaining_data.filter(like="5GB1_Cu_transition_tim", axis=1)
In [ ]:
# Put the columns into natural order. Assigning the sorted names to `.columns` would only
# re-label the columns and leave the values where they were, so each column would end up under
# the wrong name. Selecting by the sorted names moves labels and values together.
remaining_data = remaining_data[ns.natsorted(remaining_data.columns)]
In [ ]:
# Current column labels of the working frame, as a plain list.
remaining_data.columns.tolist()
In [ ]:
# Values stored under this label in the working frame.
remaining_data.loc[:, "5GB1_FM40_T150m_TR1_remake"]
In [ ]:
# Values stored under the same label in the original table.
data.loc[:, "5GB1_FM40_T150m_TR1_remake"]
In [ ]:
# Many of the columns are QC runs; list the labels to see which ones must be filtered out.
data_2.columns.tolist()
In [ ]:
# Keep only the columns whose name does not contain "QC". `DataFrame.select` was deprecated in
# pandas 0.21 and later removed, so build a boolean mask over the column labels and apply it
# with `.loc`. This is the result needed; the next cells break down how the matching works.
data_2.loc[:, ~data_2.columns.str.contains("QC")]
In [ ]:
# `re` is Python's regular-expression module; it is useful for selecting strings or parts of strings.
# Example: find the first "word:cat" that is followed by one or more digits.
str_1 = 'an example word:cat12, word:cat!!, word:cattt165'
# The r prefix makes a raw string: backslashes are passed through to the regex engine unchanged,
# so \d reaches `re` as the "digit" pattern. re.search returns the first match only, or None.
match = re.search(r'word:cat\d+', str_1)
if match:
    print("found", match.group())
else:
    print("did not find")
In [ ]:
# Find every match rather than just the first: the previous example only found one.
# Say you have a text containing several email addresses.
str_2 = "purple alice@gmail.com, and many other like bob@yahoo.com and also a dishwasher"
# re.findall returns a list with all non-overlapping matches, in the order they occur.
emails = re.findall(r"\w+@\w+\.\w+", str_2)
for email in emails:
    print(email)
In [ ]:
# Show the whole list of matches at once.
print(emails)
In [ ]:
# `.ix` mixed label- and position-based access; it was deprecated in pandas 0.20 and removed in 1.0.
# On an integer index such as the default one used here, `.ix` was label-based, so `.loc` is the
# equivalent: rows labelled 1 through 3, with both endpoints included.
data.loc[1:3]
In [ ]:
# NOTE: only the last expression of a cell is displayed, so the result of the first lookup is not shown.
data.loc[1] # returns the row whose index *label* is 1; "1" is
# interpreted as a label of the index, and never as an integer position
# (with the default 0-based index this is the second row, not the first).
data.loc[0:2,"5GB1_FM34_T0_TR1_QC"] # label-based slice: 0 and 2 are labels, and both endpoints are included.
In [ ]:
# NOTE: only the last expression of a cell is displayed; the first two results are computed and discarded.
data.iloc[1:3] # position-based slice: rows at positions 1 and 2 (the end position 3 is excluded)
data.iloc[3] # row at position 3, i.e. the fourth row (positions start at 0)
data.iloc[[1,2,5]] # rows at positions 1, 2 and 5
In [ ]:
# Check what kind of object a single-row positional lookup returns (pandas gives back a Series for one row).
type(data.iloc[4])
Content source: gilmana/Cu_transition_time_course-
Similar notebooks: