In [1]:
import pandas as pd
df = pd.read_csv('observation_table.txt', sep='\t', header=0, index_col=0)
gene_to_genome_id_map = pd.read_csv('imgv400-gene-to-genome-id-map.txt', sep='\t', header=0, index_col=0)
genome_id_to_taxon_map = pd.read_csv('00.taxon.tab.txt', sep='\t', header=0, index_col=0)
In [2]:
df = df.merge(gene_to_genome_id_map, how='left', left_index=True, right_index=True).merge(genome_id_to_taxon_map, how='left', left_on='taxon_oid', right_index=True)
In [3]:
df
Out[3]:
SF2
EB024
KP1
PE6
TL1
AR3
BZ1
CL1
DF1
SV1
...
EB017
EB020
EB021
EB019
EB026
EC
taxon_oid
taxon_display_name
domain
seq_status
GeneID
637005554
0
0
0
0
0
0
0
0
0
2
...
0
0
0
0
0
2.7.9.1
637000321
Thermotoga maritima MSB8
Bacteria
Finished
637008225
0
0
0
0
0
0
0
2
0
0
...
0
0
0
0
0
2.2.1.1
637000126
Haemophilus influenzae Rd (KW20)
Bacteria
Finished
637010307
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
4
1.2.4.1
637000315
Synechocystis sp. PCC 6803
Bacteria
Finished
637010313
2
0
0
2
0
0
0
0
0
0
...
0
0
0
0
0
1.1.1.49
637000315
Synechocystis sp. PCC 6803
Bacteria
Finished
637010474
0
0
0
0
0
0
0
0
0
0
...
2
0
0
0
0
1.2.1.12
637000315
Synechocystis sp. PCC 6803
Bacteria
Finished
637010856
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
3.1.3.11
637000315
Synechocystis sp. PCC 6803
Bacteria
Finished
637011040
0
4
0
0
0
0
2
0
0
0
...
0
0
2
0
0
5.3.1.9
637000315
Synechocystis sp. PCC 6803
Bacteria
Finished
637011596
0
0
0
0
0
0
0
0
0
0
...
0
0
0
2
2
1.1.1.44
637000315
Synechocystis sp. PCC 6803
Bacteria
Finished
637011696
0
0
4
0
0
0
0
0
0
0
...
0
0
0
0
0
4.2.1.2
637000315
Synechocystis sp. PCC 6803
Bacteria
Finished
637012460
2
0
0
0
0
0
0
0
0
0
...
0
0
0
0
2
4.2.1.3
637000315
Synechocystis sp. PCC 6803
Bacteria
Finished
637012469
0
0
0
0
0
0
0
0
0
4
...
0
0
2
0
0
1.1.5.4
637000315
Synechocystis sp. PCC 6803
Bacteria
Finished
637020438
2
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
6.4.1.1
637000010
Aquifex aeolicus VF5
Bacteria
Finished
637020501
0
2
8
0
2
8
2
6
2
4
...
6
2
2
0
0
6.2.1.5
637000010
Aquifex aeolicus VF5
Bacteria
Finished
637021056
0
4
0
0
0
0
0
0
0
0
...
0
2
2
2
0
4.1.1.32
637000328
Treponema pallidum pallidum Nichols
Bacteria
Finished
637026084
0
0
0
0
0
0
0
2
0
0
...
0
0
0
0
0
2.3.3.1
637000173
Mycobacterium tuberculosis H37Rv (lab strain)
Bacteria
Finished
637027465
0
0
0
0
0
0
2
4
0
0
...
0
0
0
0
0
1.2.4.1
637000173
Mycobacterium tuberculosis H37Rv (lab strain)
Bacteria
Finished
637027564
0
0
0
2
0
0
0
0
0
0
...
0
0
0
0
0
1.1.1.38
637000173
Mycobacterium tuberculosis H37Rv (lab strain)
Bacteria
Finished
637034561
2
0
0
0
0
2
0
0
0
0
...
0
0
0
0
0
1.2.4.1
637000092
Deinococcus radiodurans USUHS (R1)
Bacteria
Finished
637035298
0
0
0
0
0
0
0
0
0
0
...
2
4
0
0
2
4.1.1.49
637000092
Deinococcus radiodurans USUHS (R1)
Bacteria
Finished
637035865
2
2
2
0
2
0
0
0
2
2
...
0
6
6
0
2
1.1.1.42
637000092
Deinococcus radiodurans USUHS (R1)
Bacteria
Finished
637035920
0
6
0
0
0
0
0
0
0
6
...
4
0
4
22
0
1.1.1.44
637000092
Deinococcus radiodurans USUHS (R1)
Bacteria
Finished
637036046
0
0
0
2
0
0
2
2
0
0
...
0
0
0
0
4
4.2.1.3
637000092
Deinococcus radiodurans USUHS (R1)
Bacteria
Finished
637036068
0
2
2
4
0
4
4
2
0
0
...
0
2
0
0
0
5.3.1.9
637000092
Deinococcus radiodurans USUHS (R1)
Bacteria
Finished
637036973
0
2
0
0
0
0
0
0
0
2
...
0
0
0
0
0
4.2.1.2
637000092
Deinococcus radiodurans USUHS (R1)
Bacteria
Finished
637036982
0
0
0
0
0
0
0
0
0
0
...
0
2
2
0
0
4.2.1.11
637000092
Deinococcus radiodurans USUHS (R1)
Bacteria
Finished
637040460
0
2
0
0
0
0
0
0
0
0
...
0
0
0
0
0
4.1.1.49
637000055
Campylobacter jejuni jejuni NCTC 11168
Bacteria
Finished
637041743
0
0
0
0
0
0
0
0
0
0
...
0
2
0
0
0
4.1.2.13
637000068
Chlamydophila pneumoniae AR39
Bacteria
Finished
637042296
0
0
0
2
0
0
0
0
0
0
...
0
0
0
0
0
4.1.1.32
637000068
Chlamydophila pneumoniae AR39
Bacteria
Finished
637042673
0
0
0
0
0
0
0
0
0
2
...
0
0
0
0
0
5.3.1.9
637000348
Xylella fastidiosa 9a5c
Bacteria
Finished
637042900
0
0
0
0
2
0
0
0
0
0
...
0
0
0
0
0
1.2.1.12
637000348
Xylella fastidiosa 9a5c
Bacteria
Finished
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
651697149
0
0
0
0
0
0
0
0
0
0
...
2
0
0
0
0
4.2.1.3
651324086
Plautia stali symbiont
Bacteria
Draft
651697559
0
0
0
0
0
0
2
0
0
0
...
0
0
0
0
0
4.2.1.3
651324086
Plautia stali symbiont
Bacteria
Draft
651699128
0
0
0
0
0
0
0
0
0
0
...
0
0
2
0
0
1.2.1.12
651324007
Aeromonas caviae Ae398
Bacteria
Draft
651699522
0
0
0
0
0
2
0
0
0
0
...
0
0
0
0
0
4.2.1.2
651324007
Aeromonas caviae Ae398
Bacteria
Draft
651699902
0
2
0
0
0
0
0
0
0
0
...
0
0
0
0
0
1.1.1.42
651324007
Aeromonas caviae Ae398
Bacteria
Draft
651701448
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
2
4.2.1.3
651324007
Aeromonas caviae Ae398
Bacteria
Draft
651702170
2
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
2.3.1.12
651324007
Aeromonas caviae Ae398
Bacteria
Draft
651702493
0
0
0
0
0
0
0
0
0
0
...
0
0
0
4
2
5.3.1.9
651324007
Aeromonas caviae Ae398
Bacteria
Draft
651705800
0
0
0
0
0
2
0
0
0
0
...
0
0
0
0
0
1.1.1.44
651703106
Synechococcus phage S-ShM2
Viruses
Finished
651706107
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
1.1.1.44
651703102
Synechococcus phage S-SM1
Viruses
Finished
651707677
0
0
0
0
0
0
0
2
0
0
...
0
0
0
0
0
1.1.1.44
651703070
Prochlorococcus phage Syn1
Viruses
Finished
651713815
2
0
0
0
0
2
0
0
0
2
...
0
0
0
0
0
4.1.1.31
650716067
Nitrosomonas sp. Is79A3
Bacteria
Finished
651716731
0
0
0
0
0
0
0
0
0
2
...
0
0
0
0
0
5.3.1.6
651716518
Streptomyces avermitilis
GFragment:Bacteria
Finished
651717076
2
0
0
2
0
2
0
0
0
4
...
0
4
0
0
0
1.2.7.3
651716542
Streptomyces carzinostaticus
GFragment:Bacteria
Finished
651717077
0
2
0
0
0
0
0
0
0
2
...
0
0
0
0
0
1.2.7.3
651716542
Streptomyces carzinostaticus
GFragment:Bacteria
Finished
651717479
0
0
0
0
0
2
0
0
0
0
...
0
0
0
0
0
2.2.1.1
651716560
Streptomyces kanamyceticus NBRC 13414
GFragment:Bacteria
Finished
651717570
0
0
0
0
0
0
0
0
2
0
...
0
0
0
0
0
2.7.1.11
651716564
Microcystis aeruginosa K-139
GFragment:Bacteria
Finished
651720835
0
2
0
0
0
0
0
0
0
0
...
0
0
0
0
0
1.1.1.44
651716760
Streptomyces sp. SCC 2136 = ATCC 55186
GFragment:Bacteria
Finished
651720836
0
4
0
0
0
0
0
0
0
0
...
0
0
0
0
0
5.3.1.9
651716760
Streptomyces sp. SCC 2136 = ATCC 55186
GFragment:Bacteria
Finished
651720838
0
0
0
2
0
0
0
0
0
2
...
0
0
0
0
0
1.1.1.49
651716760
Streptomyces sp. SCC 2136 = ATCC 55186
GFragment:Bacteria
Finished
651720840
0
2
0
0
0
0
0
0
0
0
...
0
2
0
0
0
2.2.1.1
651716760
Streptomyces sp. SCC 2136 = ATCC 55186
GFragment:Bacteria
Finished
651720844
0
0
0
0
2
0
0
0
0
4
...
0
0
0
0
0
1.2.7.3
651716760
Streptomyces sp. SCC 2136 = ATCC 55186
GFragment:Bacteria
Finished
651721133
2
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
1.1.1.42
651716766
Streptomyces fradiae DSM 40063
GFragment:Bacteria
Finished
651724379
0
0
0
0
0
0
0
0
0
0
...
0
4
4
0
0
6.2.1.5
651716963
Streptomyces sp. NRRL 30748
GFragment:Bacteria
Finished
651724989
0
0
2
0
0
0
0
0
0
2
...
4
4
0
0
0
1.2.7.3
651717000
Streptomyces vitaminophilus ATCC 31673
GFragment:Bacteria
Finished
651724990
0
0
0
2
0
0
0
0
2
0
...
0
0
0
2
0
1.2.7.3
651717000
Streptomyces vitaminophilus ATCC 31673
GFragment:Bacteria
Finished
651725021
0
0
2
0
0
0
0
0
0
2
...
0
2
0
0
0
1.2.7.3
651717001
Streptomyces sp. UC 11065
GFragment:Bacteria
Finished
651726727
6
2
4
0
0
0
0
0
0
2
...
0
0
0
0
0
4.1.2.13
651717086
Actinoplanes garbadinensis ATCC 31049
GFragment:Bacteria
Finished
651727775
8
0
6
4
2
0
4
0
0
14
...
2
2
0
0
0
2.7.9.2
651717142
Streptomyces chattanoogensis L10, CGMCC 2644
GFragment:Bacteria
Finished
651728126
0
6
0
0
0
0
0
0
2
0
...
2
2
0
2
2
4.2.1.3
651717162
Streptomyces viridochromogenes Tue494
GFragment:Bacteria
Finished
18521 rows × 21 columns
In [7]:
collapsed_df = df.groupby(['taxon_display_name', 'EC']).sum()
In [8]:
collapsed_df
Out[8]:
SF2
EB024
KP1
PE6
TL1
AR3
BZ1
CL1
DF1
SV1
MD3
EB017
EB020
EB021
EB019
EB026
taxon_oid
taxon_display_name
EC
Abiotrophia defectiva ATCC 49176
2.7.9.1
0
0
0
0
0
0
0
0
0
2
0
0
0
0
0
0
643886181
Acaryochloris marina MBIC11017
1.1.1.38
2
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
641228474
1.1.1.42
0
4
0
2
0
0
0
0
0
0
0
0
0
0
0
0
641228474
1.1.1.44
0
0
0
0
0
0
0
0
2
2
0
2
0
6
0
0
1282456948
1.1.1.49
0
2
0
2
2
0
0
0
0
0
0
0
0
4
0
4
1282456948
1.2.1.12
0
0
0
0
0
0
0
0
0
2
0
0
0
0
0
2
641228474
1.2.4.1
0
0
0
0
0
0
0
0
0
2
0
0
0
0
0
0
641228474
2.2.1.1
0
2
0
0
0
0
0
0
0
0
0
2
0
0
0
0
641228474
2.2.1.2
0
2
0
8
4
2
2
2
0
0
2
2
0
0
0
0
1282456948
2.3.3.1
0
0
0
0
0
0
0
0
0
0
2
0
0
0
0
0
641228474
2.7.1.40
0
0
0
0
0
0
0
0
0
2
0
0
0
0
0
0
641228474
2.7.9.2
0
0
0
0
2
0
0
0
0
0
0
0
0
0
0
0
641228474
3.1.3.11
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
6
1282456948
4.1.2.13
0
0
0
0
0
0
0
0
0
0
2
0
0
0
2
0
1282456948
4.2.1.11
0
0
0
0
0
0
0
0
0
0
2
0
0
0
0
2
641228474
4.2.1.2
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
2
641228474
4.2.1.3
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
2
641228474
5.1.3.1
0
0
0
2
0
0
0
0
0
2
0
0
0
0
0
0
641228474
Acetobacter pasteurianus IFO 3283-01
1.1.1.42
0
0
0
0
0
0
0
0
0
0
0
2
0
0
0
0
644736321
1.2.4.1
0
0
0
0
0
0
0
0
0
2
0
0
0
0
0
0
644736321
1.2.4.2
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
2
644736321
2.2.1.1
0
0
0
0
0
0
2
0
0
0
0
0
0
0
0
0
644736321
2.2.1.2
0
0
0
0
0
2
0
0
0
0
0
0
0
0
0
0
644736321
2.3.3.1
2
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
644736321
2.7.9.1
0
0
0
2
0
0
0
0
0
2
0
0
0
0
0
0
644736321
4.2.1.3
0
0
0
4
0
0
0
2
0
0
0
0
0
2
0
0
644736321
Acetobacter pasteurianus IFO 3283-01-42C
1.1.1.42
0
0
0
0
0
0
0
0
0
0
4
0
2
0
0
0
646862301
1.2.4.2
0
2
0
0
0
0
0
0
0
0
0
2
0
0
0
0
646862301
2.7.9.1
2
0
0
0
0
0
0
0
0
2
2
0
0
0
0
0
646862301
4.2.1.2
0
0
0
2
0
2
0
2
0
0
0
0
0
0
0
0
646862301
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
delta proteobacterium NaphS2
5.3.1.9
0
8
66
124
14
40
34
48
50
2
8
0
2
4
0
0
648276759
delta proteobacterium sp. MLMS-1
1.1.1.42
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
2
638341245
2.2.1.1
0
0
0
0
0
0
2
0
0
0
0
0
0
0
0
0
638341245
2.3.3.1
0
0
0
0
0
0
0
0
0
0
0
0
0
2
0
0
638341245
4.2.1.11
0
0
0
0
0
0
0
2
0
0
0
0
0
0
0
0
638341245
gamma proteobacterium IMCC1989
1.2.4.1
0
0
0
2
0
2
0
0
0
0
2
0
0
0
0
0
651324112
2.7.2.3
0
0
0
0
2
0
0
0
0
0
0
0
0
0
0
0
651324112
2.7.9.2
2
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
651324112
4.1.1.3
0
0
0
0
0
2
0
0
0
0
0
0
0
0
0
0
651324112
4.2.1.2
0
0
0
2
0
0
0
0
2
0
0
0
0
0
0
0
651324112
4.2.1.3
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
2
651324112
6.4.1.1
0
2
0
0
2
0
2
2
2
2
0
0
0
0
0
4
651324112
gamma proteobacterium IMCC2047
1.1.1.38
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
2
651324113
1.1.1.40
0
0
2
0
0
2
0
0
0
0
0
2
0
0
0
0
651324113
4.2.1.2
0
0
0
0
0
0
0
0
2
2
0
0
0
0
0
0
651324113
gamma proteobacterium IMCC3088
1.1.1.40
0
2
0
0
0
0
0
0
0
0
0
0
0
0
0
0
651324114
1.1.1.42
0
0
0
0
0
0
0
2
0
0
0
0
0
0
0
0
651324114
1.2.4.1
2
2
0
0
0
0
0
0
0
2
0
0
0
0
0
0
651324114
1.2.7.3
0
16
0
2
0
2
0
0
2
2
4
8
8
4
0
2
1302648228
1.3.5.1
2
0
2
2
4
2
4
2
2
6
0
0
2
0
0
2
651324114
2.3.3.1
0
4
2
2
0
0
0
0
0
0
0
2
6
2
0
0
1302648228
4.1.2.13
0
0
0
0
0
0
0
0
0
2
0
0
0
0
0
0
651324114
4.2.1.2
0
0
0
0
0
2
0
0
0
0
0
0
0
0
0
0
651324114
6.2.1.5
0
0
0
0
0
0
0
0
2
0
0
0
0
0
0
0
651324114
gamma proteobacterium sp. HTCC5015
1.2.4.1
0
4
0
0
2
2
0
0
0
0
0
0
4
0
2
0
647533248
1.2.4.2
0
0
0
0
0
0
0
0
0
2
0
0
2
0
0
0
647533248
2.7.9.2
0
0
0
0
0
0
0
0
0
0
0
0
2
0
0
0
647533248
4.2.1.11
0
0
0
0
2
0
4
2
2
0
0
0
0
0
0
0
647533248
4.2.1.2
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
2
647533248
6.2.1.5
0
0
0
2
0
0
0
0
2
0
0
0
0
0
0
0
1295066496
15120 rows × 17 columns
In [10]:
collapsed_df.to_csv('table-by-species-ec.tsv', sep='\t')
In [ ]:
Content source: gregcaporaso/sketchbook
Similar notebooks: