In [1]:
from IPython.display import display
import pandas as pd
In [2]:
# Input locations, relative to the directory the notebook is run from.
old_study, old_map = (
    '../analysis_data/old_study.txt.gz',  # table loaded into pd_old
    '../analysis_data/old_taxa.txt',      # table loaded into pd_old_map
)
In [3]:
# Read the tab-separated study table.  header=1 takes the column names from
# the second line of the file (anything before it is skipped); the first
# column becomes the row index.
pd_old = pd.read_csv(
    old_study,
    sep='\t',
    header=1,
    index_col=0,
)
In [4]:
# Remove the two 'MP_NegC_*' columns (presumably negative controls -- confirm)
# so they do not contribute to the per-row totals computed later.
#
# `drop` returns a new frame and errors='ignore' tolerates columns that are
# already gone, so this cell can be re-run without the KeyError that a second
# `del pd_old[...]` would raise.
pd_old = pd_old.drop(columns=['MP_NegC_1', 'MP_NegC_2'], errors='ignore')
In [5]:
# Load the header-less, tab-separated taxonomy table.  The first column is
# used as the index; the column that pandas auto-labels 1 is renamed to
# 'taxonomy'.
pd_old_map = pd.read_csv(
    old_map, sep='\t', header=None, index_col=0
).rename(columns={1: 'taxonomy'})
# Name the index after the OTU id column.
pd_old_map.index.name = '#OTU ID'
In [6]:
# Keep only the rows whose index value is present in both tables, then make
# the taxonomy string the row index in place of the original row id.
pd_old_complete = (
    pd_old
    .join(pd_old_map, how='inner')
    .set_index(['taxonomy'])
)
In [7]:
# Total of every row across all remaining columns, largest total first.
old_sum = (
    pd_old_complete
    .sum(axis=1)
    .sort_values(ascending=False)
)
old_sum
Out[7]:
taxonomy
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhodobacterales; f__Rhodobacteraceae; g__Rhodobacter; 215369.0
k__Bacteria; p__Firmicutes; c__Bacilli; o__Exiguobacterales; f__Exiguobacteraceae; g__Exiguobacterium; s__Exiguobacterium 80004.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhodobacterales; f__Rhodobacteraceae; g__Thioclava; 73688.0
k__Bacteria; p__Cyanobacteria; c__Nostocophycideae; o__Nostocales; f__Rivulariaceae; 64685.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhodobacterales; f__Rhodobacteraceae; 60680.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Sphingomonadales; f__Sphingomonadaceae; g__Kaistobacter; 58333.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Sphingomonadales; f__Sphingomonadaceae; 57955.0
k__Bacteria; p__Cyanobacteria; c__Oscillatoriophycideae; o__Oscillatoriales; f__Phormidiaceae; g__Microcoleus; s__Microcoleus 52812.0
k__Bacteria; p__Cyanobacteria; c__Nostocophycideae; o__Nostocales; f__Nostocaceae; g__Nostoc; 49108.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhizobiales; f__Rhizobiaceae; g__Ensifer; 41888.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Sphingomonadales; f__Erythrobacteraceae; g__Porphyrobacter; s__Porphyrobacter 40445.0
k__Bacteria; p__Cyanobacteria; c__Nostocophycideae; o__Nostocales; f__Rivulariaceae; 34880.0
k__Bacteria; p__Cyanobacteria; c__Chloroplast; o__Stramenopiles; 33749.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhodobacterales; f__Rhodobacteraceae; g__Rhodobacter; 30469.0
k__Bacteria; p__Verrucomicrobia; c__Verrucomicrobiae; o__Verrucomicrobiales; f__Verrucomicrobiaceae; g__Luteolibacter; 27118.0
k__Bacteria; p__Cyanobacteria; c__Nostocophycideae; o__Nostocales; f__Nostocaceae; 26761.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhizobiales; f__; 25295.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhodobacterales; f__Rhodobacteraceae; g__Phaeobacter; 25143.0
k__Bacteria; p__Cyanobacteria; c__Nostocophycideae; o__Nostocales; f__Nostocaceae; 21939.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Sphingomonadales; f__Sphingomonadaceae; 21040.0
k__Bacteria; p__Planctomycetes; c__Planctomycea; o__Pirellulales; f__; g__Rhodopirellula; 20698.0
k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Methylophilales; f__Methylophilaceae; g__Methylotenera; 19902.0
k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__Comamonadaceae; g__Delftia; 19344.0
k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Aeromonadales; f__Aeromonadaceae; g__Aeromonas; 19150.0
k__Bacteria; p__Cyanobacteria; c__Oscillatoriophycideae; o__Chroococcales; f__Cyanobacteriaceae; 19144.0
k__Bacteria; p__Bacteroidetes; c__Sphingobacteria; o__Sphingobacteriales; f__Flexibacteraceae; g__Cytophaga; 18834.0
k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Xanthomonadales; f__Xanthomonadaceae; g__Luteimonas; 18551.0
k__Bacteria; p__Cyanobacteria; c__Nostocophycideae; o__Nostocales; f__Nostocaceae; g__Gloeotrichia; s__Gloeotrichia 17660.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhodobacterales; f__Rhodobacteraceae; 17408.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Sphingomonadales; f__Sphingomonadaceae; g__Sphingopyxis; 17275.0
...
k__Bacteria; p__Chlamydiae; c__Chlamydiae o__Chlamydiales; 1.0
k__Bacteria; p__Actinobacteria; c__Actinobacteria 1.0
k__Bacteria; p__Proteobacteria; c__Deltaproteobacteria; o__Syntrophobacterales; f__Syntrophobacteraceae; 1.0
k__Bacteria; p__Acidobacteria; c__; o__; f__Koribacteraceae; g__Candidatus 1.0
k__Bacteria; p__Proteobacteria; c__Deltaproteobacteria; o__Myxococcales; f__Myxococcaceae; g__Anaeromyxobacter; 1.0
k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Clostridiaceae; 1.0
k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Clostridiaceae; g__Clostridium; s__Clostridium 1.0
k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; 1.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhodospirillales; f__Rhodospirillaceae; 1.0
k__Bacteria; p__Actinobacteria; c__Actinobacteria o__Coriobacteriales; f__Coriobacteriaceae; g__Collinsella; s__Collinsella 0.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhizobiales; f__Hyphomicrobiaceae; 0.0
k__Bacteria; p__Gemmatimonadetes; c__Gemmatimonadetes 0.0
k__Bacteria; p__Actinobacteria; c__Actinobacteria o__Actinomycetales; f__Thermomonosporaceae; g__Actinoallomurus; s__Actinoallomurus 0.0
k__Bacteria; p__Acidobacteria; c__Acidobacteria o__Acidobacteriales; 0.0
k__Bacteria; p__Actinobacteria; c__Actinobacteria o__Acidimicrobiales; f__CL500-29; 0.0
k__Bacteria; p__Actinobacteria; c__Actinobacteria o__Acidimicrobiales; f__EB1017; 0.0
k__Bacteria; p__Actinobacteria; c__Actinobacteria o__Actinomycetales; f__Actinomycetaceae; g__Varibaculum; s__Varibaculum 0.0
k__Bacteria; p__Acidobacteria; c__Solibacteres; o__Solibacterales; f__Solibacteraceae; g__Candidatus 0.0
k__Bacteria; p__Actinobacteria; c__Actinobacteria o__Acidimicrobiales; f__EB1017; 0.0
k__Bacteria; p__Actinobacteria; c__Actinobacteria o__MC47; 0.0
k__Bacteria; p__Chloroflexi; c__Anaerolineae; o__SJA-101; f__SHA-31; 0.0
k__Bacteria; p__Tenericutes; c__Erysipelotrichi; o__Erysipelotrichales; f__Erysipelotrichaceae; g__; s__Eubacterium 0.0
k__Bacteria; p__Actinobacteria; c__Actinobacteria o__Actinomycetales; f__Nocardioidaceae; g__Nocardioides; 0.0
k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__Burkholderiaceae; g__Burkholderia; s__Burkholderia 0.0
k__Bacteria; p__Actinobacteria; c__Actinobacteria o__Actinomycetales; f__Corynebacteriaceae; g__Corynebacterium; s__Corynebacterium 0.0
k__Bacteria; p__Acidobacteria; c__Solibacteres; o__Solibacterales; f__Solibacteraceae; g__Candidatus 0.0
k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__Burkholderiaceae; g__Burkholderia; 0.0
k__Bacteria; p__Tenericutes; c__Mollicutes; o__Entomoplasmatales; f__Spiroplasmataceae; g__Spiroplasma; s__Spiroplasma 0.0
k__Bacteria; p__Proteobacteria; c__Deltaproteobacteria; o__Myxococcales; f__Myxococcaceae; g__Anaeromyxobacter; 0.0
k__Bacteria; p__Aquificae; c__Aquificae o__Aquificales; f__Aquificaceae; g__Thermocrinis; s__Thermocrinis 0.0
dtype: float64
In [8]:
# Several rows can carry the same taxonomy label; add their totals together
# so each label appears once, then rank the labels by total.
old_group = (
    old_sum
    .groupby(level=0)
    .sum()
    .sort_values(ascending=False)
)
old_group
Out[8]:
taxonomy
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhodobacterales; f__Rhodobacteraceae; g__Rhodobacter; 293754.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Sphingomonadales; f__Sphingomonadaceae; 165550.0
k__Bacteria; p__Cyanobacteria; c__Nostocophycideae; o__Nostocales; f__Rivulariaceae; 104734.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhodobacterales; f__Rhodobacteraceae; 103557.0
k__Bacteria; p__Firmicutes; c__Bacilli; o__Exiguobacterales; f__Exiguobacteraceae; g__Exiguobacterium; s__Exiguobacterium 88510.0
k__Bacteria; p__Cyanobacteria; c__Nostocophycideae; o__Nostocales; f__Nostocaceae; g__Nostoc; 84379.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhodobacterales; f__Rhodobacteraceae; g__Thioclava; 83049.0
k__Bacteria; p__Bacteroidetes; c__Sphingobacteria; o__Sphingobacteriales; f__Saprospiraceae; 73607.0
k__Bacteria; p__Verrucomicrobia; c__Verrucomicrobiae; o__Verrucomicrobiales; f__Verrucomicrobiaceae; g__Luteolibacter; 67324.0
k__Bacteria; p__Cyanobacteria; c__Chloroplast; o__Stramenopiles; 66928.0
k__Bacteria; p__Bacteroidetes; c__Sphingobacteria; o__Sphingobacteriales; f__Flexibacteraceae; g__Cytophaga; 66522.0
k__Bacteria; p__Bacteroidetes; c__Sphingobacteria; o__Sphingobacteriales; f__; 65522.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Sphingomonadales; f__Sphingomonadaceae; g__Kaistobacter; 60619.0
k__Bacteria; p__Cyanobacteria; c__Nostocophycideae; o__Nostocales; f__Nostocaceae; 57803.0
k__Bacteria; p__Planctomycetes; c__Planctomycea; o__Pirellulales; f__; g__Rhodopirellula; 55378.0
k__Bacteria; p__Cyanobacteria; c__Oscillatoriophycideae; o__Oscillatoriales; f__Phormidiaceae; g__Microcoleus; s__Microcoleus 53096.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhizobiales; f__Rhizobiaceae; g__Ensifer; 43464.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Sphingomonadales; f__Erythrobacteraceae; g__Porphyrobacter; s__Porphyrobacter 40557.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhizobiales; f__; 37940.0
k__Bacteria; p__Planctomycetes; c__Planctomycea; o__Gemmatales; f__Gemmataceae; g__Gemmata; 37271.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhodobacterales; f__Rhodobacteraceae; g__Phaeobacter; 25143.0
k__Bacteria; p__Cyanobacteria; c__Nostocophycideae; o__Nostocales; f__Nostocaceae; g__Dolichospermum; 24773.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Sphingomonadales; f__Sphingomonadaceae; g__Sphingopyxis; 24769.0
k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Methylophilales; f__Methylophilaceae; g__Methylotenera; 24605.0
k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Chromatiales; f__Sinobacteraceae; 22745.0
k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Rhodocyclales; f__Rhodocyclaceae; g__Methyloversatilis; 21579.0
k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__; g__Methylibium; 21451.0
k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__Comamonadaceae; 21425.0
k__Bacteria; p__Cyanobacteria; c__Oscillatoriophycideae; o__Chroococcales; f__Cyanobacteriaceae; 19644.0
k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__Comamonadaceae; g__Delftia; 19489.0
...
k__Bacteria; p__Actinobacteria; c__Actinobacteria o__Actinomycetales; f__Nocardiopsaceae; g__Nocardiopsis; s__Nocardiopsis 1.0
k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Thiotrichales; f__Thiotrichaceae; g__Thioploca; s__Thioploca 1.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhodospirillales; f__Rhodospirillaceae; g__Rhodocista; s__Rhodocista 1.0
k__Bacteria; p__Actinobacteria; c__Actinobacteria o__Acidimicrobiales; f__TK06; 1.0
k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Oceanospirillales; f__OM60; g__Congregibacter; s__Congregibacter 1.0
k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Thermosinus; s__Thermosinus 1.0
k__Bacteria; p__Actinobacteria; c__Actinobacteria o__Actinomycetales; f__Microbacteriaceae; g__Yonghaparkia; 1.0
k__Bacteria; p__Bacteroidetes; c__Sphingobacteria; o__Sphingobacteriales; f__Balneolaceae; g__Balneola; s__Balneola 1.0
k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhizobiales; f__Hyphomicrobiaceae; g__Devosia; s__Candidatus 1.0
k__Bacteria; p__SBR1093; c__EC214; 1.0
k__Bacteria; p__Actinobacteria; c__Actinobacteria o__Actinomycetales; f__Micromonosporaceae; g__Virgisporangium; s__Virgisporangium 1.0
k__Bacteria; p__Planctomycetes; c__vadinHA49; 1.0
k__Bacteria; p__Planctomycetes; c__Planctomycea; o__Pirellulales; 1.0
k__Bacteria; p__Chloroflexi; 1.0
k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Staphylococcaceae; g__Salinicoccus; s__Salinicoccus 1.0
k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Chromatiales; f__Thiotrichaceae; 1.0
k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Rhodocyclales; f__Rhodocyclaceae; g__Azospira; 1.0
k__Bacteria; p__Tenericutes; c__Erysipelotrichi; o__Erysipelotrichales; f__Erysipelotrichaceae; g__p-75-a5; 1.0
k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Nitrosomonadales; f__Nitrosomonadaceae; g__Nitrosovibrio; s__Nitrosovibrio 1.0
k__Bacteria; p__Firmicutes; c__Clostridia; o__Thermoanaerobacterales; f__Thermoanaerobacteraceae; g__Thermoanaerobacter; 1.0
k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Peptostreptococcaceae; g__Peptostreptococcus; 1.0
k__Bacteria; p__Tenericutes; c__Erysipelotrichi; o__Erysipelotrichales; f__Erysipelotrichaceae; g__Clostridium; s__Clostridium 1.0
k__Bacteria; p__Chloroflexi; c__Dehalococcoidetes; o__Dehalococcoidales; f__Dehalococcoidaceae; g__Dehalococcoides; 1.0
k__Bacteria; p__Planctomycetes; c__Planctomycea; o__Gemmatales; f__Isosphaeraceae; g__Singulisphaera; s__Singulisphaera 1.0
k__Bacteria; p__Tenericutes; c__Erysipelotrichi; o__Erysipelotrichales; f__Erysipelotrichaceae; g__Allobaculum; s__Allobaculum 1.0
k__Bacteria; p__Actinobacteria; c__Actinobacteria o__Actinomycetales; f__Corynebacteriaceae; g__Corynebacterium; s__Corynebacterium 0.0
k__Bacteria; p__Actinobacteria; c__Actinobacteria o__Coriobacteriales; f__Coriobacteriaceae; g__Collinsella; s__Collinsella 0.0
k__Bacteria; p__Actinobacteria; c__Actinobacteria o__Actinomycetales; f__Actinomycetaceae; g__Varibaculum; s__Varibaculum 0.0
k__Bacteria; p__Aquificae; c__Aquificae o__Aquificales; f__Aquificaceae; g__Thermocrinis; s__Thermocrinis 0.0
k__Bacteria; p__Tenericutes; c__Mollicutes; o__Entomoplasmatales; f__Spiroplasmataceae; g__Spiroplasma; s__Spiroplasma 0.0
dtype: float64
In [9]:
# Re-order the grouped totals by taxonomy label and show the raw numbers.
#
# * Rebinding the sorted copy (instead of sort_index(inplace=True)) gives the
#   same `old_group` for later cells without mutating the object in place.
# * Series.get_values() was deprecated in pandas 0.25 and removed in 1.0;
#   `.values` yields the same ndarray on old and new pandas alike.
old_group = old_group.sort_index()
old_group.values
Out[9]:
array([ 9., 30., 41., ..., 107., 11., 15.])
In [10]:
# Build a one-column frame of the grouped totals whose row index is a
# MultiIndex: every taxonomy label is split on single spaces and each piece
# becomes one index level.  The pieces are used exactly as they appear in the
# label, so any trailing ';' stays part of the level value.
#
# `.values` replaces Series.get_values(), which was removed in pandas 1.0.
old_org = pd.DataFrame(
    old_group.values,
    index=pd.MultiIndex.from_tuples(
        [tuple(label.split(' ')) for label in old_group.index]
    ),
)
# The display.max_rows set/reset pair that used to follow only wrapped a
# disabled display() call and had no effect, so it was dropped.  To inspect
# every row, show old_org inside pd.option_context('display.max_rows', None).
In [22]:
# Name every index level after its integer position (0, 1, 2, ...).
old_org.index.names = list(range(old_org.index.nlevels))

# Sum the totals over the first two index levels and rank the groups, largest
# first.  The only column is labelled 0 -- the same label as index level 0 --
# so DataFrame.sort_values(0) is ambiguous and raises ValueError on current
# pandas.  Selecting the column first and sorting it as a Series avoids the
# ambiguity; to_frame() restores the one-column DataFrame for display.
old_org.groupby(level=[0, 1]).sum()[0].sort_values(ascending=False).to_frame()
Out[22]:
0
0
1
k__Bacteria;
p__Proteobacteria;
1514474.0
p__Cyanobacteria;
526554.0
p__Bacteroidetes;
318190.0
p__Firmicutes;
133312.0
p__Planctomycetes;
132029.0
p__Verrucomicrobia;
122162.0
p__Actinobacteria;
46439.0
p__Acidobacteria;
15619.0
p__Chloroflexi;
10231.0
p__Gemmatimonadetes;
9531.0
p__Thermi;
4666.0
p__Tenericutes;
4655.0
p__Armatimonadetes;
4349.0
p__Chlamydiae;
3371.0
p__Chlorobi;
2265.0
p__TM6;
1572.0
p__SR1;
1246.0
p__Fusobacteria;
1082.0
p__Nitrospirae;
713.0
k__Archaea;
p__Euryarchaeota;
454.0
k__Bacteria;
p__GN02;
436.0
p__SM2F11;
383.0
p__Spirochaetes;
370.0
p__Elusimicrobia;
281.0
p__Hyd24-12;
242.0
k__Archaea;
p__Crenarchaeota;
231.0
k__Bacteria;
p__TM7;
196.0
p__GOUTA4;
148.0
p__Thermotogae;
142.0
p__NKB19;
122.0
...
...
p__WS3;
113.0
p__OP11;
95.0
p__SAR406;
87.0
p__Synergistetes;
78.0
p__Deferribacteres;
76.0
p__Aquificae;
68.0
p__GN1;
58.0
p__AD3;
32.0
p__SPAM;
28.0
p__TG3;
27.0
p__OP9;
26.0
p__ZB3;
15.0
p__NC10;
14.0
p__WPS-2;
14.0
p__MVP-15;
13.0
p__OP3;
11.0
p__ZB2;
11.0
p__SC3;
8.0
p__BRC1;
8.0
p__GN04;
7.0
p__49S1_2B;
7.0
p__OP8;
6.0
p__SBR1093;
5.0
p__Lentisphaerae;
4.0
p__LCP-89;
4.0
p__Dictyoglomi;
2.0
p__GN06;
2.0
p__KSB1;
2.0
p__OP1;
1.0
p__HDBW-WB69;
1.0
61 rows × 1 columns
In [ ]:
Content source: tiagoantao/MARC
Similar notebooks: