In [1]:
import pandas as pd

df = pd.read_csv('observation_table.txt', sep='\t', header=0, index_col=0)
gene_to_genome_id_map = pd.read_csv('imgv400-gene-to-genome-id-map.txt', sep='\t', header=0, index_col=0)
genome_id_to_taxon_map = pd.read_csv('00.taxon.tab.txt', sep='\t', header=0, index_col=0)

In [2]:
df = df.merge(gene_to_genome_id_map, how='left', left_index=True, right_index=True).merge(genome_id_to_taxon_map, how='left', left_on='taxon_oid', right_index=True)

In [3]:
df


Out[3]:
SF2 EB024 KP1 PE6 TL1 AR3 BZ1 CL1 DF1 SV1 ... EB017 EB020 EB021 EB019 EB026 EC taxon_oid taxon_display_name domain seq_status
GeneID
637005554 0 0 0 0 0 0 0 0 0 2 ... 0 0 0 0 0 2.7.9.1 637000321 Thermotoga maritima MSB8 Bacteria Finished
637008225 0 0 0 0 0 0 0 2 0 0 ... 0 0 0 0 0 2.2.1.1 637000126 Haemophilus influenzae Rd (KW20) Bacteria Finished
637010307 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 4 1.2.4.1 637000315 Synechocystis sp. PCC 6803 Bacteria Finished
637010313 2 0 0 2 0 0 0 0 0 0 ... 0 0 0 0 0 1.1.1.49 637000315 Synechocystis sp. PCC 6803 Bacteria Finished
637010474 0 0 0 0 0 0 0 0 0 0 ... 2 0 0 0 0 1.2.1.12 637000315 Synechocystis sp. PCC 6803 Bacteria Finished
637010856 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 3.1.3.11 637000315 Synechocystis sp. PCC 6803 Bacteria Finished
637011040 0 4 0 0 0 0 2 0 0 0 ... 0 0 2 0 0 5.3.1.9 637000315 Synechocystis sp. PCC 6803 Bacteria Finished
637011596 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 2 2 1.1.1.44 637000315 Synechocystis sp. PCC 6803 Bacteria Finished
637011696 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 4.2.1.2 637000315 Synechocystis sp. PCC 6803 Bacteria Finished
637012460 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 2 4.2.1.3 637000315 Synechocystis sp. PCC 6803 Bacteria Finished
637012469 0 0 0 0 0 0 0 0 0 4 ... 0 0 2 0 0 1.1.5.4 637000315 Synechocystis sp. PCC 6803 Bacteria Finished
637020438 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 6.4.1.1 637000010 Aquifex aeolicus VF5 Bacteria Finished
637020501 0 2 8 0 2 8 2 6 2 4 ... 6 2 2 0 0 6.2.1.5 637000010 Aquifex aeolicus VF5 Bacteria Finished
637021056 0 4 0 0 0 0 0 0 0 0 ... 0 2 2 2 0 4.1.1.32 637000328 Treponema pallidum pallidum Nichols Bacteria Finished
637026084 0 0 0 0 0 0 0 2 0 0 ... 0 0 0 0 0 2.3.3.1 637000173 Mycobacterium tuberculosis H37Rv (lab strain) Bacteria Finished
637027465 0 0 0 0 0 0 2 4 0 0 ... 0 0 0 0 0 1.2.4.1 637000173 Mycobacterium tuberculosis H37Rv (lab strain) Bacteria Finished
637027564 0 0 0 2 0 0 0 0 0 0 ... 0 0 0 0 0 1.1.1.38 637000173 Mycobacterium tuberculosis H37Rv (lab strain) Bacteria Finished
637034561 2 0 0 0 0 2 0 0 0 0 ... 0 0 0 0 0 1.2.4.1 637000092 Deinococcus radiodurans USUHS (R1) Bacteria Finished
637035298 0 0 0 0 0 0 0 0 0 0 ... 2 4 0 0 2 4.1.1.49 637000092 Deinococcus radiodurans USUHS (R1) Bacteria Finished
637035865 2 2 2 0 2 0 0 0 2 2 ... 0 6 6 0 2 1.1.1.42 637000092 Deinococcus radiodurans USUHS (R1) Bacteria Finished
637035920 0 6 0 0 0 0 0 0 0 6 ... 4 0 4 22 0 1.1.1.44 637000092 Deinococcus radiodurans USUHS (R1) Bacteria Finished
637036046 0 0 0 2 0 0 2 2 0 0 ... 0 0 0 0 4 4.2.1.3 637000092 Deinococcus radiodurans USUHS (R1) Bacteria Finished
637036068 0 2 2 4 0 4 4 2 0 0 ... 0 2 0 0 0 5.3.1.9 637000092 Deinococcus radiodurans USUHS (R1) Bacteria Finished
637036973 0 2 0 0 0 0 0 0 0 2 ... 0 0 0 0 0 4.2.1.2 637000092 Deinococcus radiodurans USUHS (R1) Bacteria Finished
637036982 0 0 0 0 0 0 0 0 0 0 ... 0 2 2 0 0 4.2.1.11 637000092 Deinococcus radiodurans USUHS (R1) Bacteria Finished
637040460 0 2 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 4.1.1.49 637000055 Campylobacter jejuni jejuni NCTC 11168 Bacteria Finished
637041743 0 0 0 0 0 0 0 0 0 0 ... 0 2 0 0 0 4.1.2.13 637000068 Chlamydophila pneumoniae AR39 Bacteria Finished
637042296 0 0 0 2 0 0 0 0 0 0 ... 0 0 0 0 0 4.1.1.32 637000068 Chlamydophila pneumoniae AR39 Bacteria Finished
637042673 0 0 0 0 0 0 0 0 0 2 ... 0 0 0 0 0 5.3.1.9 637000348 Xylella fastidiosa 9a5c Bacteria Finished
637042900 0 0 0 0 2 0 0 0 0 0 ... 0 0 0 0 0 1.2.1.12 637000348 Xylella fastidiosa 9a5c Bacteria Finished
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
651697149 0 0 0 0 0 0 0 0 0 0 ... 2 0 0 0 0 4.2.1.3 651324086 Plautia stali symbiont Bacteria Draft
651697559 0 0 0 0 0 0 2 0 0 0 ... 0 0 0 0 0 4.2.1.3 651324086 Plautia stali symbiont Bacteria Draft
651699128 0 0 0 0 0 0 0 0 0 0 ... 0 0 2 0 0 1.2.1.12 651324007 Aeromonas caviae Ae398 Bacteria Draft
651699522 0 0 0 0 0 2 0 0 0 0 ... 0 0 0 0 0 4.2.1.2 651324007 Aeromonas caviae Ae398 Bacteria Draft
651699902 0 2 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 1.1.1.42 651324007 Aeromonas caviae Ae398 Bacteria Draft
651701448 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 2 4.2.1.3 651324007 Aeromonas caviae Ae398 Bacteria Draft
651702170 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 2.3.1.12 651324007 Aeromonas caviae Ae398 Bacteria Draft
651702493 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 4 2 5.3.1.9 651324007 Aeromonas caviae Ae398 Bacteria Draft
651705800 0 0 0 0 0 2 0 0 0 0 ... 0 0 0 0 0 1.1.1.44 651703106 Synechococcus phage S-ShM2 Viruses Finished
651706107 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 1.1.1.44 651703102 Synechococcus phage S-SM1 Viruses Finished
651707677 0 0 0 0 0 0 0 2 0 0 ... 0 0 0 0 0 1.1.1.44 651703070 Prochlorococcus phage Syn1 Viruses Finished
651713815 2 0 0 0 0 2 0 0 0 2 ... 0 0 0 0 0 4.1.1.31 650716067 Nitrosomonas sp. Is79A3 Bacteria Finished
651716731 0 0 0 0 0 0 0 0 0 2 ... 0 0 0 0 0 5.3.1.6 651716518 Streptomyces avermitilis GFragment:Bacteria Finished
651717076 2 0 0 2 0 2 0 0 0 4 ... 0 4 0 0 0 1.2.7.3 651716542 Streptomyces carzinostaticus GFragment:Bacteria Finished
651717077 0 2 0 0 0 0 0 0 0 2 ... 0 0 0 0 0 1.2.7.3 651716542 Streptomyces carzinostaticus GFragment:Bacteria Finished
651717479 0 0 0 0 0 2 0 0 0 0 ... 0 0 0 0 0 2.2.1.1 651716560 Streptomyces kanamyceticus NBRC 13414 GFragment:Bacteria Finished
651717570 0 0 0 0 0 0 0 0 2 0 ... 0 0 0 0 0 2.7.1.11 651716564 Microcystis aeruginosa K-139 GFragment:Bacteria Finished
651720835 0 2 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 1.1.1.44 651716760 Streptomyces sp. SCC 2136 = ATCC 55186 GFragment:Bacteria Finished
651720836 0 4 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 5.3.1.9 651716760 Streptomyces sp. SCC 2136 = ATCC 55186 GFragment:Bacteria Finished
651720838 0 0 0 2 0 0 0 0 0 2 ... 0 0 0 0 0 1.1.1.49 651716760 Streptomyces sp. SCC 2136 = ATCC 55186 GFragment:Bacteria Finished
651720840 0 2 0 0 0 0 0 0 0 0 ... 0 2 0 0 0 2.2.1.1 651716760 Streptomyces sp. SCC 2136 = ATCC 55186 GFragment:Bacteria Finished
651720844 0 0 0 0 2 0 0 0 0 4 ... 0 0 0 0 0 1.2.7.3 651716760 Streptomyces sp. SCC 2136 = ATCC 55186 GFragment:Bacteria Finished
651721133 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 1.1.1.42 651716766 Streptomyces fradiae DSM 40063 GFragment:Bacteria Finished
651724379 0 0 0 0 0 0 0 0 0 0 ... 0 4 4 0 0 6.2.1.5 651716963 Streptomyces sp. NRRL 30748 GFragment:Bacteria Finished
651724989 0 0 2 0 0 0 0 0 0 2 ... 4 4 0 0 0 1.2.7.3 651717000 Streptomyces vitaminophilus ATCC 31673 GFragment:Bacteria Finished
651724990 0 0 0 2 0 0 0 0 2 0 ... 0 0 0 2 0 1.2.7.3 651717000 Streptomyces vitaminophilus ATCC 31673 GFragment:Bacteria Finished
651725021 0 0 2 0 0 0 0 0 0 2 ... 0 2 0 0 0 1.2.7.3 651717001 Streptomyces sp. UC 11065 GFragment:Bacteria Finished
651726727 6 2 4 0 0 0 0 0 0 2 ... 0 0 0 0 0 4.1.2.13 651717086 Actinoplanes garbadinensis ATCC 31049 GFragment:Bacteria Finished
651727775 8 0 6 4 2 0 4 0 0 14 ... 2 2 0 0 0 2.7.9.2 651717142 Streptomyces chattanoogensis L10, CGMCC 2644 GFragment:Bacteria Finished
651728126 0 6 0 0 0 0 0 0 2 0 ... 2 2 0 2 2 4.2.1.3 651717162 Streptomyces viridochromogenes Tue494 GFragment:Bacteria Finished

18521 rows × 21 columns


In [7]:
collapsed_df = df.groupby(['taxon_display_name', 'EC']).sum()

In [8]:
collapsed_df


Out[8]:
SF2 EB024 KP1 PE6 TL1 AR3 BZ1 CL1 DF1 SV1 MD3 EB017 EB020 EB021 EB019 EB026 taxon_oid
taxon_display_name EC
Abiotrophia defectiva ATCC 49176 2.7.9.1 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 643886181
Acaryochloris marina MBIC11017 1.1.1.38 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 641228474
1.1.1.42 0 4 0 2 0 0 0 0 0 0 0 0 0 0 0 0 641228474
1.1.1.44 0 0 0 0 0 0 0 0 2 2 0 2 0 6 0 0 1282456948
1.1.1.49 0 2 0 2 2 0 0 0 0 0 0 0 0 4 0 4 1282456948
1.2.1.12 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 2 641228474
1.2.4.1 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 641228474
2.2.1.1 0 2 0 0 0 0 0 0 0 0 0 2 0 0 0 0 641228474
2.2.1.2 0 2 0 8 4 2 2 2 0 0 2 2 0 0 0 0 1282456948
2.3.3.1 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 641228474
2.7.1.40 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 641228474
2.7.9.2 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 641228474
3.1.3.11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 1282456948
4.1.2.13 0 0 0 0 0 0 0 0 0 0 2 0 0 0 2 0 1282456948
4.2.1.11 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 2 641228474
4.2.1.2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 641228474
4.2.1.3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 641228474
5.1.3.1 0 0 0 2 0 0 0 0 0 2 0 0 0 0 0 0 641228474
Acetobacter pasteurianus IFO 3283-01 1.1.1.42 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 644736321
1.2.4.1 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 644736321
1.2.4.2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 644736321
2.2.1.1 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 644736321
2.2.1.2 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 644736321
2.3.3.1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 644736321
2.7.9.1 0 0 0 2 0 0 0 0 0 2 0 0 0 0 0 0 644736321
4.2.1.3 0 0 0 4 0 0 0 2 0 0 0 0 0 2 0 0 644736321
Acetobacter pasteurianus IFO 3283-01-42C 1.1.1.42 0 0 0 0 0 0 0 0 0 0 4 0 2 0 0 0 646862301
1.2.4.2 0 2 0 0 0 0 0 0 0 0 0 2 0 0 0 0 646862301
2.7.9.1 2 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 646862301
4.2.1.2 0 0 0 2 0 2 0 2 0 0 0 0 0 0 0 0 646862301
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
delta proteobacterium NaphS2 5.3.1.9 0 8 66 124 14 40 34 48 50 2 8 0 2 4 0 0 648276759
delta proteobacterium sp. MLMS-1 1.1.1.42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 638341245
2.2.1.1 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 638341245
2.3.3.1 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 638341245
4.2.1.11 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 638341245
gamma proteobacterium IMCC1989 1.2.4.1 0 0 0 2 0 2 0 0 0 0 2 0 0 0 0 0 651324112
2.7.2.3 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 651324112
2.7.9.2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 651324112
4.1.1.3 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 651324112
4.2.1.2 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 651324112
4.2.1.3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 651324112
6.4.1.1 0 2 0 0 2 0 2 2 2 2 0 0 0 0 0 4 651324112
gamma proteobacterium IMCC2047 1.1.1.38 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 651324113
1.1.1.40 0 0 2 0 0 2 0 0 0 0 0 2 0 0 0 0 651324113
4.2.1.2 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 651324113
gamma proteobacterium IMCC3088 1.1.1.40 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 651324114
1.1.1.42 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 651324114
1.2.4.1 2 2 0 0 0 0 0 0 0 2 0 0 0 0 0 0 651324114
1.2.7.3 0 16 0 2 0 2 0 0 2 2 4 8 8 4 0 2 1302648228
1.3.5.1 2 0 2 2 4 2 4 2 2 6 0 0 2 0 0 2 651324114
2.3.3.1 0 4 2 2 0 0 0 0 0 0 0 2 6 2 0 0 1302648228
4.1.2.13 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 651324114
4.2.1.2 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 651324114
6.2.1.5 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 651324114
gamma proteobacterium sp. HTCC5015 1.2.4.1 0 4 0 0 2 2 0 0 0 0 0 0 4 0 2 0 647533248
1.2.4.2 0 0 0 0 0 0 0 0 0 2 0 0 2 0 0 0 647533248
2.7.9.2 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 647533248
4.2.1.11 0 0 0 0 2 0 4 2 2 0 0 0 0 0 0 0 647533248
4.2.1.2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 647533248
6.2.1.5 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 1295066496

15120 rows × 17 columns


In [10]:
collapsed_df.to_csv('table-by-species-ec.tsv', sep='\t')

In [ ]: