Integration of species information into the mapping data from NB

Goal: get proportion of reads from each KO by species


In [1]:
import pandas as pd
import matplotlib as mpl

import matplotlib.pyplot as plt
import cPickle as cpk


%matplotlib inline

In [2]:
#Load PhytoKEGG Annotations
AllPhytoKO_ann=pd.read_table('AllPhytoKegg_annotated.tab', header=False, delimiter='\t')

In [3]:
AllPhytoKO_ann


Out[3]:
kID
Ampcof8 K01762
Ampcof9 K01455
Ampcof22 K15106
Ampcof34 K09838
Ampcof42 K08675
Ampcof48 K03885
Ampcof56 K06639
Ampcof60 K09506
Ampcof64 K09291
Ampcof72 K07769
Ampcof80 K03350
Ampcof89 K12194
Ampcof96 K03004
Ampcof97 K14713
Ampcof133 K16833
Ampcof154 K07652
Ampcof180 K03267
Ampcof188 K02938
Ampcof196 K11292
Ampcof205 K09540
Ampcof222 K01556
Ampcof234 K17871
Ampcof240 K05034
Ampcof249 K03324
Ampcof257 K00045
Ampcof264 K15181
Ampcof268 K17778
Ampcof269 K01874
Ampcof270 K05656
Ampcof271 K03152
... ...
Scyaps32057 K09013
Scyaps32061 K04043
Scyaps32063 K08592
Scyaps32064 K04488
Scyaps32065 K02437
Scyaps32070 K02209
Scyaps32086 K02880
Scyaps32095 K09490
Scyaps32106 K09502
Scyaps32107 K03231
Scyaps32132 K12606
Scyaps32191 K01201
Scyaps32194 K14566
Scyaps32200 K01079
Scyaps32206 K05869
Scyaps32210 K02641
Scyaps32214 K03942
Scyaps32215 K07874
Scyaps32216 K09420
Scyaps32238 K01854
Scyaps32260 K02893
Scyaps32261 K13337
Scyaps32272 K13094
Scyaps32273 K00480
Scyaps32274 K01581
Scyaps32288 K11251
Scyaps32296 K07300
Scyaps32299 K08008
Scyaps32305 K15450
Scyaps32306 K13348

561156 rows × 1 columns


In [4]:
InsituCounts=pd.read_table('Data/AllInsitu.tab', index_col='gID')

In [6]:
#normalize to the library size
InsituTPM=InsituCounts.copy()
InsituTPM[['S1', 'S2', 'S3', 'S4', 'S5']]=(InsituCounts[['S1', 'S2', 'S3', 'S4', 'S5']]/InsituCounts[['S1', 'S2', 'S3', 'S4', 'S5']].sum())*10**6
#drop any all zero reads for simplicity
InsituCounts=InsituCounts[(InsituCounts.T != 0).any()]
InsituTPM=InsituTPM[(InsituTPM.T != 0).any()]
#Add annotation information
InsituCounts=InsituCounts.join(AllPhytoKO_ann)
InsituTPM=InsituTPM.join(AllPhytoKO_ann)
InsituCounts=InsituCounts.dropna()
InsituTPM=InsituTPM.dropna()

In [7]:
#load in the species/group information
Group_Species=pd.read_table('GrpSpecies',delimiter=' ').T.drop(['MMETSP', 
                                                  'MMETSP.1']).T.drop_duplicates().set_index('SName')

In [19]:
InsituTPM.groupby(['kID','sgID']).sum().reset_index()


Out[19]:
kID sgID S1 S2 S3 S4 S5
0 K00001 Aletam 0.000000 0.000000 0.011808 0.079876 0.000000
1 K00001 Ampcof 0.011243 0.044565 0.011808 0.000000 0.000000
2 K00001 Ampmas 0.011243 0.014855 0.035423 0.053250 0.000000
3 K00001 Astgla 0.000000 0.000000 0.011808 0.000000 0.000000
4 K00001 Attsep 0.112434 0.133696 0.047231 0.026625 0.178521
5 K00001 Aulsub 0.000000 0.000000 0.000000 0.000000 0.022315
6 K00001 Chacf. 0.067460 0.029710 0.023615 0.053250 0.022315
7 K00001 Chadeb 0.078704 0.059420 0.460501 0.931881 0.781030
8 K00001 Chadic 0.000000 0.000000 0.000000 0.000000 0.000000
9 K00001 Chaneo 0.000000 0.000000 0.106270 0.266252 0.178521
10 K00001 Chasp 0.011243 0.029710 0.153500 0.159751 0.133891
11 K00001 Chreri 0.000000 0.000000 0.023615 0.000000 0.022315
12 K00001 Corpen 0.000000 0.014855 0.011808 0.000000 0.000000
13 K00001 Crycoh 0.134920 0.103985 0.129885 0.079876 0.044630
14 K00001 Cycmen 0.753306 0.222826 0.106270 0.266252 0.044630
15 K00001 Cylclo 0.022487 0.029710 0.011808 0.000000 0.000000
16 K00001 Dacfra 0.056217 0.000000 0.094462 0.053250 0.022315
17 K00001 Detcon 0.000000 0.044565 0.011808 0.000000 0.044630
18 K00001 Ditbri 0.056217 0.000000 0.047231 0.026625 0.022315
19 K00001 Durbal 0.000000 0.014855 0.000000 0.053250 0.000000
20 K00001 Entsp 0.000000 0.000000 0.000000 0.000000 0.000000
21 K00001 Eucant 0.247354 0.297101 0.614002 1.198133 1.495115
22 K00001 Extspi 0.044973 0.014855 0.023615 0.000000 0.089261
23 K00001 Glefol 0.033730 0.000000 0.000000 0.000000 0.000000
24 K00001 Graoce 0.000000 0.000000 0.011808 0.000000 0.000000
25 K00001 Heltam 0.000000 0.000000 0.011808 0.000000 0.000000
26 K00001 Hetrot 0.000000 0.000000 0.000000 0.000000 0.022315
27 K00001 Karbre 0.011243 0.000000 0.035423 0.053250 0.156206
28 K00001 Karmic 0.022487 0.029710 0.011808 0.026625 0.000000
29 K00001 Kryfol 0.022487 0.014855 0.059039 0.026625 0.044630
... ... ... ... ... ... ... ...
272444 K18277 Pelbei 0.460978 0.326811 0.590386 0.186376 0.870291
272445 K18277 Peraci 21.857112 42.366628 17.369167 24.308786 28.027832
272446 K18277 Phaant 0.134920 0.133696 0.165308 0.106501 0.156206
272447 K18277 Phasp 0.000000 0.000000 0.011808 0.000000 0.000000
272448 K18277 Proala 0.000000 0.044565 0.035423 0.000000 0.000000
272449 K18277 Proine 0.000000 0.029710 0.035423 0.053250 0.044630
272450 K18277 Proret 0.089947 0.252536 0.177116 0.133126 0.178521
272451 K18277 Pseare 0.112434 0.029710 0.047231 0.106501 0.000000
272452 K18277 Pseaus 0.022487 0.029710 0.023615 0.079876 0.022315
272453 K18277 Psedel 0.056217 0.000000 0.011808 0.053250 0.022315
272454 K18277 Psefra 0.044973 0.014855 0.000000 0.133126 0.089261
272455 K18277 Psepun 0.044973 0.014855 0.094462 0.000000 0.000000
272456 K18277 Pyrbah 0.202381 0.222826 0.259770 0.213001 0.178521
272457 K18277 Scrhan 0.146164 0.356521 0.318809 0.266252 0.178521
272458 K18277 Scrtro 0.022487 0.089130 0.047231 0.053250 0.022315
272459 K18277 Skedoh 0.089947 7.620645 0.047231 0.026625 0.111576
272460 K18277 Skemar 0.674602 43.644163 0.153500 0.106501 1.004182
272461 K18277 Skemen 0.089947 0.133696 0.000000 0.026625 0.022315
272462 K18277 Stacon 0.011243 0.029710 0.118077 0.213001 0.044630
272463 K18277 Stetur 0.168651 0.059420 0.177116 0.213001 0.245467
272464 K18277 Struni 0.213624 0.014855 0.000000 0.000000 0.000000
272465 K18277 Synrec 0.000000 0.029710 0.047231 0.000000 0.044630
272466 K18277 Thaant 0.528438 0.118840 1.464158 0.612379 1.829843
272467 K18277 Thafra 0.011243 0.029710 0.118077 0.079876 0.089261
272468 K18277 Thamin 0.101190 0.014855 0.011808 0.079876 0.044630
272469 K18277 Thanit 0.101190 0.059420 0.059039 0.079876 0.022315
272470 K18277 Thapun 0.000000 0.014855 0.000000 0.000000 0.000000
272471 K18277 Tharot 0.000000 0.044565 0.082654 0.079876 0.022315
272472 K18277 Thasp 0.033730 0.014855 0.047231 0.079876 0.000000
272473 K18277 Tridub 0.056217 0.059420 0.212539 0.186376 0.401673

272474 rows × 7 columns


In [ ]:
gs=Group_Species.index
for g in gs:
    print g

In [63]:
Group_Species


Out[63]:
Grp
SName
Ochsp Ochrophyta
Prypar Haptophyta
Graoce Bacillariophyta
Corhys Bacillariophyta
Rhomar Rhodophyta
Skecos Bacillariophyta
Nitsp Bacillariophyta
Odoaur Bacillariophyta
Cylclo Bacillariophyta
Urosp Ciliophora
Dinsp Ochrophyta
Karbre Dinophyta
Dolten Chlorophyta
Neppyr Chlorophyta
Crypar Cryptophyta
Eutgym Euglenozoa
Lotoce Cercozoa
Lotglo Cercozoa
Lotamo Cercozoa
Hemand Cryptophyta
Bignat Chlorarachniophyta
Guithe Cryptophyta
Polpar Chlorophyta
Pyrpar Chlorophyta
D1 Unknown
Alemon Dinophyta
Ptedan Ochrophyta
ParImp Ochrophyta
Acasp Sarcomastigophora
GonPac Cryptophyta
... ...
Hemruf Cryptophyta
Biglon Chlorarachniophyta
Gepoce Haptophyta
Eupcra Ciliophora
Ammsp Foraminifera
Batpra Chlorophyta
Psehei Bacillariophyta
Chabre Bacillariophyta
Aleand Dinophyta
Eucant Bacillariophyta
Ptesp Chlorophyta
Gonspi Pyrrophycophyta
Polgla Alveolata
Hetarc Alveolata
Craaus Ochrophyta
Entsp Bacillariophyta
Pyrsp Chlorophyta
CCMP2111 Unknown
Chadic Bacillariophyta
Attsep Bacillariophyta
Madery Rhodophyta
Vitbra Alveolata
Synpus Unknown
RCC701 Unknown
RCC1871 Unknown
Branut Alveolata
Phacor Haptophyta
CCMP2135 Unknown
Mansp Chlorophyta
CCMP1999 Unknown

283 rows × 1 columns


In [ ]: