In [1]:
from __future__ import division
import pandas as pd
from StringIO import StringIO
import pandas as pd

# Tab-separated gene -> EC mapping file (hard-coded local path; adjust when
# running on another machine).
fp = "/Users/caporaso/data/img-qiime-25oct2012/gene_ec_numeric.tsv"

# One [gene ID, EC code] record per input line.
data = []

# Open through `fp` (the path literal used to be duplicated) and use a context
# manager so the handle is closed even if parsing raises. The 'U'
# (universal-newline) mode is kept because this notebook targets Python 2
# (it imports the Python 2 `StringIO` module).
with open(fp, 'U') as f:
    for line in f:
        fields = line.strip().split('\t')
        if fields[1] == 'unknown':
            # Genes without an EC assignment are stored with a missing value.
            ec_fields = None
        else:
            # Second column uses ';' as the separator; rewrite to dotted form.
            ec_fields = fields[1].replace(';', '.')
        data.append([fields[0], ec_fields])

df = pd.DataFrame(data, columns=['Gene ID', 'EC'])
# Group by EC code; the 'Gene ID' column of the result holds the per-EC count
# of gene IDs.
img_ec_counts = df.groupby(['EC']).count()

These are the central C metabolic network EC code abundances in the Fierer et al. (2012) soil metagenomes, as computed with QIIME 1.9.0 by Greg Caporaso. Here I load these abundances, and then add the corresponding EC counts from the QIIME/IMG reference database (25 Oct 2012).


In [14]:
# Read the tab-separated observation table: the second line of the file is
# used as the header row and the first column becomes the index.
ec_biom = pd.read_csv(
    'observation_table_L1.txt',
    sep='\t',
    header=1,
    index_col=0,
)

# Look up the reference-database gene count for every EC code in the table
# index, in index order, and attach it as a new column.
img_gene_counts = img_ec_counts['Gene ID']
ec_biom['IMG Count'] = [img_gene_counts[ec_code] for ec_code in ec_biom.index]

In [15]:
# Display the abundance table, including the appended 'IMG Count' column.
ec_biom


Out[15]:
SF2 EB024 KP1 PE6 TL1 AR3 BZ1 CL1 DF1 SV1 MD3 EB017 EB020 EB021 EB019 EB026 IMG Count
#OTU ID
1.1.1.37 0.017810 0.017309 0.020856 0.016496 0.024085 0.021733 0.023032 0.017188 0.022540 0.016076 0.016914 0.020178 0.021898 0.014984 0.018831 0.024107 2762
1.1.1.38 0.014922 0.014055 0.006450 0.005193 0.002942 0.009229 0.003965 0.006531 0.007971 0.015177 0.017327 0.015133 0.014436 0.017681 0.005472 0.004219 2435
1.1.1.39 0.001685 0.000000 0.000430 0.000458 0.000735 0.000298 0.001322 0.001031 0.001924 0.000787 0.000619 0.000605 0.000000 0.000000 0.001127 0.000603 92
1.1.1.40 0.013478 0.012233 0.021071 0.024591 0.028130 0.016374 0.024920 0.031282 0.029137 0.016751 0.012583 0.013721 0.013139 0.004795 0.018992 0.023505 1115
1.1.1.41 0.008664 0.008459 0.003870 0.009928 0.007170 0.011015 0.006985 0.006360 0.003848 0.007532 0.007632 0.008878 0.010381 0.010488 0.003541 0.002863 820
1.1.1.42 0.059928 0.083550 0.055257 0.056667 0.042103 0.048824 0.053238 0.055174 0.049753 0.061720 0.053424 0.084140 0.075750 0.069224 0.069693 0.059063 2482
1.1.1.44 0.012515 0.013274 0.023866 0.028868 0.024453 0.023817 0.025297 0.024063 0.024189 0.012591 0.016089 0.009080 0.013301 0.017081 0.007404 0.006780 2789
1.1.1.49 0.027437 0.027199 0.024726 0.033145 0.029785 0.025305 0.035303 0.031110 0.030511 0.022822 0.028259 0.021186 0.027737 0.030866 0.015451 0.014314 2773
1.1.1.82 0.000481 0.000000 0.000000 0.000153 0.000000 0.000000 0.000000 0.000172 0.000000 0.000112 0.001238 0.000000 0.000000 0.000000 0.000000 0.000000 31
1.1.5.4 0.002888 0.000390 0.003655 0.004277 0.000735 0.003870 0.003587 0.003781 0.003848 0.003485 0.004332 0.001614 0.001946 0.002697 0.001931 0.001356 1157
1.2.1.12 0.028881 0.026809 0.027306 0.022911 0.027395 0.022626 0.029262 0.023376 0.017317 0.029117 0.029703 0.023810 0.030495 0.026071 0.023660 0.031189 4847
1.2.1.59 0.000481 0.000390 0.000000 0.000153 0.000000 0.000000 0.000000 0.000000 0.000000 0.000112 0.000413 0.000000 0.000000 0.000000 0.000000 0.001959 189
1.2.1.9 0.001925 0.000781 0.002150 0.001222 0.001287 0.002679 0.000566 0.001375 0.001649 0.002248 0.003094 0.000404 0.000162 0.000000 0.002414 0.007684 507
1.2.4.1 0.111913 0.106976 0.107934 0.106614 0.106086 0.100923 0.089296 0.101066 0.095657 0.113210 0.111386 0.072437 0.104298 0.103686 0.106068 0.088444 6217
1.2.4.2 0.047172 0.054008 0.044507 0.038491 0.040632 0.038107 0.042666 0.038501 0.046729 0.051265 0.055899 0.036521 0.048175 0.045850 0.061806 0.051680 1887
1.2.7.1 0.008183 0.001562 0.004300 0.007179 0.013054 0.013992 0.001699 0.001891 0.002199 0.006408 0.007219 0.004843 0.003244 0.000599 0.000322 0.000301 1573
1.2.7.3 0.046450 0.039693 0.044937 0.064152 0.071337 0.058946 0.055692 0.049330 0.047554 0.049466 0.048267 0.053471 0.058070 0.032964 0.007726 0.022299 4120
1.3.5.1 0.016606 0.012363 0.018491 0.016954 0.018202 0.015183 0.025864 0.024923 0.024739 0.019562 0.012170 0.014124 0.013139 0.008391 0.000966 0.014314 525
1.8.1.4 0.027918 0.022775 0.033326 0.022148 0.026108 0.027389 0.025675 0.029907 0.028312 0.031141 0.029909 0.020581 0.025142 0.019778 0.042813 0.052132 5230
2.2.1.1 0.038267 0.041775 0.055687 0.051627 0.042839 0.058351 0.049462 0.053455 0.063496 0.044407 0.045998 0.065174 0.038767 0.056638 0.055046 0.049571 5894
2.2.1.2 0.007461 0.007548 0.008600 0.012219 0.009377 0.008336 0.010383 0.010657 0.012095 0.006071 0.006188 0.007667 0.006164 0.007492 0.009013 0.012054 3250
2.3.1.12 0.006258 0.006117 0.009675 0.007637 0.014892 0.008038 0.011138 0.012547 0.007971 0.008657 0.009695 0.004843 0.008921 0.007492 0.016900 0.016122 3201
2.3.1.61 0.012756 0.013014 0.014836 0.013136 0.014341 0.016374 0.015669 0.017876 0.016218 0.015177 0.012789 0.012712 0.012003 0.009889 0.027201 0.026217 2138
2.3.3.1 0.062335 0.056871 0.041926 0.037574 0.046148 0.044061 0.031716 0.042111 0.040957 0.061270 0.056312 0.054681 0.062936 0.072820 0.051183 0.041434 3455
2.3.3.8 0.000722 0.000390 0.000645 0.001069 0.000368 0.002679 0.000566 0.000172 0.000275 0.001237 0.000206 0.001211 0.000487 0.000000 0.000000 0.000151 197
2.7.1.1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000225 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 390
2.7.1.11 0.009146 0.009370 0.014836 0.015274 0.015076 0.013695 0.011327 0.009110 0.013469 0.011355 0.015677 0.013519 0.013463 0.015283 0.013681 0.011602 3498
2.7.1.2 0.003369 0.002212 0.001720 0.003513 0.004780 0.001191 0.002265 0.002406 0.003299 0.002811 0.003300 0.002220 0.003082 0.002697 0.001127 0.002863 3252
2.7.1.40 0.004091 0.002733 0.005160 0.005193 0.005700 0.003275 0.007363 0.006188 0.004948 0.004272 0.004950 0.003834 0.003244 0.004795 0.014003 0.014163 4086
2.7.1.90 0.000481 0.000130 0.005805 0.000611 0.000735 0.002679 0.000189 0.000859 0.003299 0.000562 0.000206 0.000000 0.000000 0.000000 0.000000 0.000000 427
2.7.2.3 0.005295 0.005206 0.004730 0.003513 0.007170 0.003275 0.006419 0.006875 0.007147 0.004272 0.003507 0.005851 0.003731 0.001798 0.003541 0.010095 3305
2.7.9.1 0.036101 0.032145 0.033756 0.034672 0.043574 0.041679 0.036436 0.028188 0.037658 0.032490 0.034860 0.027643 0.029846 0.020677 0.010140 0.012355 1222
2.7.9.2 0.018773 0.006377 0.018491 0.010539 0.019305 0.013397 0.010383 0.009453 0.006872 0.017313 0.018358 0.014124 0.009246 0.008091 0.001449 0.016423 1838
3.1.1.31 0.000241 0.000390 0.000000 0.001375 0.001103 0.000595 0.001699 0.001547 0.001649 0.000562 0.000825 0.002825 0.000973 0.000300 0.000322 0.000603 2430
3.1.3.11 0.008664 0.009891 0.005805 0.007026 0.007722 0.005656 0.006041 0.007219 0.005772 0.006970 0.010107 0.005650 0.007786 0.011088 0.018027 0.020341 3329
4.1.1.3 0.001925 0.000130 0.000430 0.001527 0.002574 0.000298 0.000189 0.000859 0.000275 0.000562 0.000206 0.000000 0.000000 0.000000 0.000000 0.000753 1976
4.1.1.31 0.007220 0.005856 0.007525 0.006721 0.007538 0.005061 0.014348 0.009969 0.009621 0.007532 0.008870 0.004843 0.003569 0.018280 0.001610 0.020341 1545
4.1.1.32 0.037064 0.071317 0.035261 0.030701 0.026108 0.036320 0.033793 0.030595 0.025014 0.036200 0.039191 0.076877 0.060503 0.050644 0.046193 0.022751 629
4.1.1.49 0.025030 0.015747 0.025371 0.018482 0.018937 0.022030 0.021710 0.015297 0.019791 0.021922 0.017739 0.020581 0.021411 0.025172 0.022211 0.027723 1639
4.1.1.71 0.001925 0.000651 0.001505 0.000458 0.000000 0.001489 0.001133 0.002922 0.002749 0.002811 0.002888 0.000404 0.001460 0.010189 0.000483 0.000151 76
4.1.2.13 0.027677 0.025247 0.024296 0.023217 0.025188 0.022328 0.028507 0.026470 0.024189 0.023834 0.023515 0.029056 0.019627 0.021277 0.030903 0.035558 5019
4.1.2.14 0.000963 0.000651 0.001505 0.000764 0.000735 0.001489 0.001510 0.000344 0.001374 0.000450 0.000206 0.002018 0.000811 0.000599 0.000644 0.001356 2226
4.2.1.11 0.025271 0.027330 0.017846 0.020773 0.020592 0.022030 0.022466 0.022173 0.018692 0.019224 0.022690 0.025222 0.024655 0.025472 0.030742 0.030887 3516
4.2.1.2 0.042118 0.034487 0.038486 0.062013 0.041552 0.041977 0.056825 0.076831 0.059923 0.037662 0.029497 0.042978 0.031306 0.030566 0.032834 0.041434 4744
4.2.1.3 0.080144 0.085632 0.070522 0.051932 0.059018 0.069664 0.070795 0.057752 0.061847 0.073300 0.085602 0.083939 0.080779 0.096794 0.105424 0.078047 3856
5.1.3.1 0.005776 0.006507 0.007095 0.006262 0.006803 0.008634 0.005852 0.007047 0.006047 0.006408 0.004744 0.006255 0.006002 0.007192 0.007404 0.010246 3495
5.3.1.1 0.001444 0.000521 0.001290 0.001375 0.001287 0.000595 0.001322 0.002234 0.000825 0.001574 0.002269 0.000404 0.001622 0.000899 0.002092 0.003013 3549
5.3.1.6 0.002647 0.001692 0.004085 0.002444 0.002390 0.004168 0.003209 0.002234 0.004673 0.002811 0.004332 0.001009 0.001622 0.002098 0.000644 0.002561 4338
5.3.1.9 0.019495 0.017699 0.046227 0.051932 0.026843 0.041679 0.027185 0.034548 0.043705 0.021585 0.019596 0.018765 0.018167 0.014384 0.021729 0.024409 3347
6.2.1.4 0.000722 0.000651 0.001075 0.000916 0.000919 0.001489 0.000944 0.001031 0.001100 0.001124 0.000000 0.002220 0.001298 0.000300 0.000000 0.000452 347
6.2.1.5 0.040433 0.053748 0.039131 0.039255 0.044310 0.040488 0.046819 0.036267 0.039857 0.047330 0.039191 0.049839 0.044120 0.051543 0.039112 0.047461 4444
6.4.1.1 0.016847 0.016137 0.013545 0.016649 0.017834 0.016672 0.013970 0.017704 0.017317 0.018437 0.020008 0.012914 0.021087 0.020378 0.048125 0.012054 1550

Next, I compute the pairwise Spearman correlation between all columns of this table (the per-metagenome abundances and the IMG counts).


In [4]:
# Pairwise Spearman (rank) correlation between the columns of ec_biom.
correlation_m = ec_biom.corr(method='spearman')

It looks like the counts on a per-metagenome basis are not just reflections of the abundances of those EC codes in the database, which is good. This was performed as a control.


In [16]:
# Correlation of every column with the 'IMG Count' column.
correlation_m['IMG Count']


Out[16]:
SF2          0.552581
EB024        0.569684
KP1          0.562419
PE6          0.578918
TL1          0.591514
AR3          0.548234
BZ1          0.584805
CL1          0.568916
DF1          0.555027
SV1          0.537885
MD3          0.560644
EB017        0.547447
EB020        0.569616
EB021        0.560016
EB019        0.603022
EB026        0.638898
IMG Count    1.000000
Name: IMG Count, dtype: float64

It also looks like, in general, the abundances are correlated across soils, which is also a good control (i.e., they are fairly repeatable across metagenomes, so we're probably not just looking at random noise).


In [17]:
# Display the full correlation matrix.
correlation_m


Out[17]:
SF2 EB024 KP1 PE6 TL1 AR3 BZ1 CL1 DF1 SV1 MD3 EB017 EB020 EB021 EB019 EB026 IMG Count
SF2 1.000000 0.971681 0.953232 0.954394 0.956385 0.962455 0.951231 0.947285 0.940659 0.989835 0.983194 0.967914 0.976669 0.941802 0.877002 0.898420 0.552581
EB024 0.971681 1.000000 0.943569 0.944071 0.938282 0.955933 0.949812 0.944819 0.940904 0.969692 0.966467 0.972437 0.985939 0.955803 0.901202 0.895901 0.569684
KP1 0.953232 0.943569 1.000000 0.960966 0.955248 0.972255 0.956970 0.961223 0.974224 0.963672 0.946224 0.933983 0.943290 0.901943 0.861446 0.894351 0.562419
PE6 0.954394 0.944071 0.960966 1.000000 0.976700 0.973178 0.972031 0.971414 0.963830 0.957978 0.941430 0.939336 0.948706 0.894883 0.850572 0.868899 0.578918
TL1 0.956385 0.938282 0.955248 0.976700 1.000000 0.955224 0.961341 0.956689 0.948182 0.955742 0.942819 0.935745 0.943258 0.879975 0.839207 0.879488 0.591514
AR3 0.962455 0.955933 0.972255 0.973178 0.955224 1.000000 0.953038 0.948878 0.957606 0.968262 0.954636 0.948721 0.958565 0.910152 0.848246 0.849643 0.548234
BZ1 0.951231 0.949812 0.956970 0.972031 0.961341 0.953038 1.000000 0.979098 0.975038 0.962866 0.943348 0.950931 0.955988 0.922703 0.860994 0.893511 0.584805
CL1 0.947285 0.944819 0.961223 0.971414 0.956689 0.948878 0.979098 1.000000 0.984520 0.962378 0.940086 0.932181 0.946314 0.911332 0.882572 0.903470 0.568916
DF1 0.940659 0.940904 0.974224 0.963830 0.948182 0.957606 0.975038 0.984520 1.000000 0.954301 0.933245 0.931446 0.939983 0.902166 0.862762 0.881292 0.555027
SV1 0.989835 0.969692 0.963672 0.957978 0.955742 0.968262 0.962866 0.962378 0.954301 1.000000 0.980993 0.968339 0.979552 0.944857 0.879388 0.900335 0.537885
MD3 0.983194 0.966467 0.946224 0.941430 0.942819 0.954636 0.943348 0.940086 0.933245 0.980993 1.000000 0.955072 0.975791 0.950739 0.890163 0.889722 0.560644
EB017 0.967914 0.972437 0.933983 0.939336 0.935745 0.948721 0.950931 0.932181 0.931446 0.968339 0.955072 1.000000 0.978454 0.933023 0.864222 0.878635 0.547447
EB020 0.976669 0.985939 0.943290 0.948706 0.943258 0.958565 0.955988 0.946314 0.939983 0.979552 0.975791 0.978454 1.000000 0.962633 0.894338 0.880732 0.569616
EB021 0.941802 0.955803 0.901943 0.894883 0.879975 0.910152 0.922703 0.911332 0.902166 0.944857 0.950739 0.933023 0.962633 1.000000 0.869620 0.853994 0.560016
EB019 0.877002 0.901202 0.861446 0.850572 0.839207 0.848246 0.860994 0.882572 0.862762 0.879388 0.890163 0.864222 0.894338 0.869620 1.000000 0.920820 0.603022
EB026 0.898420 0.895901 0.894351 0.868899 0.879488 0.849643 0.893511 0.903470 0.881292 0.900335 0.889722 0.878635 0.880732 0.853994 0.920820 1.000000 0.638898
IMG Count 0.552581 0.569684 0.562419 0.578918 0.591514 0.548234 0.584805 0.568916 0.555027 0.537885 0.560644 0.547447 0.569616 0.560016 0.603022 0.638898 1.000000

And just for the record, these are Paul Dijkstra's central C metabolic network "EC codes of interest".


In [7]:
# Annotated list of EC codes, one per line: the EC code comes first, followed
# by a free-text description. Wrapped in StringIO so it can be iterated line
# by line like a file.
ec_listing = StringIO("""1.1.1.286 (isocitrate dehydrogenase)
1.1.1.37 (malate dehydrogenase to oxaloacetate)
1.1.1.38 (malate dehydrogenase /malic enzyme to pyruvate)
1.1.1.39 (malate dehydrogenase /malic enzyme to pyruvate)
1.1.1.40 (malate dehydrogenase/ malic enzyme to pyruvate)
1.1.1.41 (isocitrate dehydrogenase)
1.1.1.42 (isocitrate dehydrogenase)
1.1.1.44 (6 phosphogluconate dehydrogenase)
1.1.1.49 (glucose-6P dehydrogenase to gluconolactone-P)
1.1.1.82 (malate dehydrogenase to oxaloacetate)
1.1.1.343 (phosphogluconate dehydrogenase)
1.1.1.351 (phosphogluconate dehydrogenase)
1.1.5.4 (malate oxidoreductase to oxaloacetate)
1.2.1.9 (glyceraldehyde dehydrogenase - glycerald to glycerate)
1.2.1.12 (glyceraldehyde3P dehydrogenase)
1.2.1.59 (glyceraldehyde3P dehydrogenase)
1.2.1.79 (succinate semialdehyde dehydrogenase to succinate)
1.2.1.9 (glyceraldehyde3P dehydrogenase)
1.2.4.1 (pyruvate dehydrogenase – split CO2)
1.2.4.2 (oxoglutarate dehydrogenase /decarboxylase/PDH complex E3 – split CO2 from oxoglutarate)
1.2.7.1 (pyruvate synthase /oxidoreductase to acetyl-CoA)
1.2.7.3 (oxoglutarate synthase to succinyl-coa)
1.2.7.6 (glyceraldehyde3P dehydrogenase /oxidoreductase to 3Pglycerate)
1.3.5.1 (succinate dehydrogenase to fumarate)
1.3.5.4 (fumarate reductase to succinate)
1.8.1.4 (dihydrolipoyl dehydrogenase – PDH and OGL complex – NADH generating subunit)
2.2.1.1 (transketolase ppp reaction)
2.2.1.2 (transaldolase ppp reaction)
2.3.1.12 (PDH complex number 3 produces acetyl CoA)
2.3.1.61 (dihydropoyllysine-residue succinyl transferase – complex 3 to succinyl-CoA)
2.3.3.1 (citrate synthase OAA to citrate)
2.3.3.16 (citrate synthase OAA to citrate)
2.3.3.8 (citrate synthase OAA to citrate)
2.7.1.1 (hexokinase – to G6P)
2.7.1.11 (6 phosphofructokinase F6 to F1,6)
2.7.1.146 (ADP-specific phosphofructokinase from F6P to F16P)
2.7.1.147 (ADP specific glucokinase to G6P)
2.7.1.2 (glucokinase)
2.7.1.40 (pyruvate kinase to PEP)
2.7.1.90 (phosphofructokinase 6 to 16 fructose)
2.7.1.146 (ADP specific phosphofructokinase – pyrococcus)
2.7.2.3 (phosphoglycerate kinase – phosphorylation of 13P2glycerate)
2.7.9.1 (pyruvate phosphate dikinase to PEP)
2.7.9.2 (pyruvate water kinase to PEP)
3.1.1.31 (6-phosphogluconolactonase to gluconate 6P)
3.1.3.11 (fructobisphosphatase 1,6P to 6P)
3.1.3.13 (bisphophoglycerate phosphatase – 2,3 to 3P glycerate)
3.1.3.80 (bisphophoglycerate 3-phosphatase 2,3 to 2P glycerate)
4.1.1.3 (oxaloacetate decarboxylase to pyruvate)
4.1.1.31 (PEP carboxylase to OAA)
4.1.1.32 (PEP carboxykinase to OAA)
4.1.1.38 (PEP carboxykinase to OAA)
4.1.1.49 (PEP carboxykinase to OAA)
4.1.1.71 oxoglutarate decarboxylase
4.1.2.13 (fructose bisphosphate aldolase to DHAP and Glyceraldehyde)
4.1.2.14 (2dehydro-3deoxy phosphogluconate aldolase to pyr and glyceraldehyde ED)
4.1.2.51 (2dehydroxy3deoxy gluconate aldolase to glyceraldehyde and pyruvate ED)
4.1.2.55 (2dehydroxy3deoxy gluconate /galactonate aldolase to glyceraldehyde and pyr ED)
4.2.1.11 (phosphopyruvate hydratase – 2P glyc to PEP)
4.2.1.2 (fumarate hydratase – malate to fum)
4.2.1.3 (aconitate hydratase –citrate to isocitrate)
5.1.3.1 (ribuloseP 3 epimerase – to xylulose)
5.3.1.1 (trioseP isomerase)
5.3.1.6 (ribose isomerase ribose to ribulose)
5.3.1.9 (glucose 6 isomerase to fructose 6P)
5.4.2.11 (phosphoglycerate mutase 2 to 3 glycerate)
5.4.2.12 (phosphoglycerate mutase 2 to 3 glycerate)
5.4.2.4 (bisphosphoglycerate mutase (1,3 to 2,3 glycerate)
6.2.1.4 (succinate coa ligase from succinyl)
6.2.1.5 (succinate coa ligase from succinyl)
6.4.1.1 (pyruvate carboxylase)""")

# Keep only the leading whitespace-delimited token (the EC code) of each line.
# NOTE: 1.2.1.9 and 2.7.1.146 are each listed twice above, so the resulting
# list contains those codes twice.
ec_of_interest = [entry.split(None, 1)[0] for entry in ec_listing]