In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [8]:
mf = pd.read_csv('mapping_MrOS.txt', sep='\t', dtype=str, index_col='#SampleID', na_values='Missing:Not collected')

In [80]:
mf.head()


Out[80]:
BarcodeSequence LinkerPrimerSequence Experiment_Design_Description Library_Construction_Protocol Linker Platform Center_Name Center_Project Instrument_Model Title ... OHVD3 OHVD2 OHV1D2 OHV1D2CT OHVD2CT OHVDTOT OHV1DTOT OHSEAS VDstatus Description
#SampleID
BI0023 TCTGGTGACATT GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq MrOS_VitaminD ... 25.8 0.0 0.0 1: Yes 1: Yes 25.8 39.3 3:SUMMER sufficiency Orwoll.BI0023.BI
BI0056 CAAGCATGCCTA GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq MrOS_VitaminD ... 39.2 0.0 0.0 1: Yes 1: Yes 39.2 61.9 2:SPRING sufficiency Orwoll.BI0056.BI
BI0131 CTATTTGCGACA GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq MrOS_VitaminD ... 23.1 0.0 0.0 1: Yes 1: Yes 23.1 52.1 2:SPRING sufficiency Orwoll.BI0131.BI
BI0153 ATCGGCGTTACA GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq MrOS_VitaminD ... 27.3 0.0 0.0 1: Yes 1: Yes 27.3 43.1 2:SPRING sufficiency Orwoll.BI0153.BI
BI0215 CCTCTCGTGATC GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq MrOS_VitaminD ... 33 0.0 0.0 1: Yes 1: Yes 33.0 50.2 4:FALL sufficiency Orwoll.BI0215.BI

5 rows × 64 columns


In [9]:
mf.head()


Out[9]:
BarcodeSequence LinkerPrimerSequence Experiment_Design_Description Library_Construction_Protocol Linker Platform Center_Name Center_Project Instrument_Model Title ... OHVD3 OHVD2 OHV1D2 OHV1D2CT OHVD2CT OHVDTOT OHV1DTOT OHSEAS VDstatus Description
#SampleID
BI0023 TCTGGTGACATT GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq MrOS_VitaminD ... 25.8 0.0 0.0 1: Yes 1: Yes 25.8 39.3 3:SUMMER sufficiency Orwoll.BI0023.BI
BI0056 CAAGCATGCCTA GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq MrOS_VitaminD ... 39.2 0.0 0.0 1: Yes 1: Yes 39.2 61.9 2:SPRING sufficiency Orwoll.BI0056.BI
BI0131 CTATTTGCGACA GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq MrOS_VitaminD ... 23.1 0.0 0.0 1: Yes 1: Yes 23.1 52.1 2:SPRING sufficiency Orwoll.BI0131.BI
BI0153 ATCGGCGTTACA GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq MrOS_VitaminD ... 27.3 0.0 0.0 1: Yes 1: Yes 27.3 43.1 2:SPRING sufficiency Orwoll.BI0153.BI
BI0215 CCTCTCGTGATC GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq MrOS_VitaminD ... 33 0.0 0.0 1: Yes 1: Yes 33.0 50.2 4:FALL sufficiency Orwoll.BI0215.BI

5 rows × 64 columns


In [10]:
!pwd


/Users/serene/Knight/VitaminD/MrOS_VitaminD/notebooks

In [108]:
dat1 = pd.read_csv('../../1.1_cd_20k/taxa_plots/taxa_summary_plots/raw_data/table_mc5876_sorted_L2.txt', 
                   skiprows=1, sep='\t', dtype=str, index_col = '#OTU ID')

In [109]:
dat1.head()


Out[109]:
BI0023 BI0056 BI0131 BI0153 BI0215 BI0353 BI0371 BI0372 BI0380 BI0389 ... SD8964 SD8966 SD8972 SD8973 SD8985 SD8996 SD8999 SD9001 SD9003 SD9009
#OTU ID
Unassigned;Other 0.00299015548809 0.0 9.61261174661e-05 0.0 0.0 0.0 0.000276518084283 0.00083811488621 0.0 0.0 ... 0.0 0.0 0.000346040717458 0.00404538727183 0.0 0.0 0.00166008357662 0.0 0.0 0.0
k__Archaea;p__Euryarchaeota 0.000276014352746 0.00010055304173 9.61261174661e-05 0.0 0.0 0.0 0.000331821701139 0.0 0.000159650896706 0.0 ... 0.0 0.0 0.000230693811639 0.000888011840158 0.0 0.0 0.0 0.0 0.00012025012025 0.0
k__Bacteria;p__Actinobacteria 0.00740638513203 0.000703871292107 0.000240315293665 0.00693939594211 0.000622629761703 9.95173408967e-05 0.00143789403827 0.00477080781381 0.000585386621255 0.0 ... 0.00326422051623 0.004073858526 0.000403714170367 0.00236803157375 0.00689437065149 0.00164134060465 0.00217528192799 0.0 0.00210437710438 0.0116162877842
k__Bacteria;p__Bacteroidetes 0.45496365811 0.79069884364 0.283572046525 0.522040843302 0.548876436294 0.529083942877 0.461619289902 0.631874153826 0.385823000373 0.640941928313 ... 0.666445022064 0.000846516057351 0.569698367841 0.277355698076 0.460215053763 0.553555355536 0.67588299273 0.739278420694 0.474326599327 0.480139895079
k__Bacteria;p__Cyanobacteria 0.0 0.0 0.0 0.0 0.0186788928511 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.00105892942235 0.0 0.0 0.0 0.0

5 rows × 599 columns


In [110]:
dat1.shape


Out[110]:
(14, 599)

In [112]:
dat1.index


Out[112]:
Index(['Unassigned;Other', 'k__Archaea;p__Euryarchaeota',
       'k__Bacteria;p__Actinobacteria', 'k__Bacteria;p__Bacteroidetes',
       'k__Bacteria;p__Cyanobacteria', 'k__Bacteria;p__Elusimicrobia',
       'k__Bacteria;p__Firmicutes', 'k__Bacteria;p__Fusobacteria',
       'k__Bacteria;p__Lentisphaerae', 'k__Bacteria;p__Proteobacteria',
       'k__Bacteria;p__Spirochaetes', 'k__Bacteria;p__Synergistetes',
       'k__Bacteria;p__Tenericutes', 'k__Bacteria;p__Verrucomicrobia'],
      dtype='object', name='#OTU ID')

In [85]:
dat1 = dat1.transpose()

In [86]:
dat1.head()


Out[86]:
#OTU ID Unassigned;Other k__Archaea;p__Euryarchaeota k__Bacteria;p__Actinobacteria k__Bacteria;p__Bacteroidetes k__Bacteria;p__Cyanobacteria k__Bacteria;p__Elusimicrobia k__Bacteria;p__Firmicutes k__Bacteria;p__Fusobacteria k__Bacteria;p__Lentisphaerae k__Bacteria;p__Proteobacteria k__Bacteria;p__Spirochaetes k__Bacteria;p__Synergistetes k__Bacteria;p__Tenericutes k__Bacteria;p__Verrucomicrobia
BI0023 0.002990 0.000276 0.007406 0.454964 0.000000 0.0 0.481185 0.0 0.0 0.010443 0.0 0.000092 0.010259 0.032386
BI0056 0.000000 0.000101 0.000704 0.790699 0.000000 0.0 0.158371 0.0 0.0 0.049422 0.0 0.000000 0.000151 0.000553
BI0131 0.000096 0.000096 0.000240 0.283572 0.000000 0.0 0.237864 0.0 0.0 0.459387 0.0 0.000000 0.008075 0.010670
BI0153 0.000000 0.000000 0.006939 0.522041 0.000000 0.0 0.453110 0.0 0.0 0.016258 0.0 0.000000 0.000000 0.001652
BI0215 0.000000 0.000000 0.000623 0.548876 0.018679 0.0 0.393276 0.0 0.0 0.030339 0.0 0.000000 0.000000 0.008207

In [104]:
dat1.columns


Out[104]:
Index(['Unassigned;Other', 'k__Archaea;p__Euryarchaeota',
       'k__Bacteria;p__Actinobacteria', 'k__Bacteria;p__Bacteroidetes',
       'k__Bacteria;p__Cyanobacteria', 'k__Bacteria;p__Elusimicrobia',
       'k__Bacteria;p__Firmicutes', 'k__Bacteria;p__Fusobacteria',
       'k__Bacteria;p__Lentisphaerae', 'k__Bacteria;p__Proteobacteria',
       'k__Bacteria;p__Spirochaetes', 'k__Bacteria;p__Synergistetes',
       'k__Bacteria;p__Tenericutes', 'k__Bacteria;p__Verrucomicrobia'],
      dtype='object', name='#OTU ID')

In [88]:
dat = dat1.loc[:, ['k__Bacteria;p__Firmicutes']]

In [89]:
df = pd.merge(mf, dat, left_index=True, right_index=True)

In [90]:
dat.shape


Out[90]:
(599, 1)

In [91]:
mf.shape


Out[91]:
(599, 64)

In [92]:
df.shape


Out[92]:
(599, 65)

In [93]:
df.head()


Out[93]:
BarcodeSequence LinkerPrimerSequence Experiment_Design_Description Library_Construction_Protocol Linker Platform Center_Name Center_Project Instrument_Model Title ... OHVD2 OHV1D2 OHV1D2CT OHVD2CT OHVDTOT OHV1DTOT OHSEAS VDstatus Description k__Bacteria;p__Firmicutes
BI0023 TCTGGTGACATT GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq MrOS_VitaminD ... 0.0 0.0 1: Yes 1: Yes 25.8 39.3 3:SUMMER sufficiency Orwoll.BI0023.BI 0.481185
BI0056 CAAGCATGCCTA GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq MrOS_VitaminD ... 0.0 0.0 1: Yes 1: Yes 39.2 61.9 2:SPRING sufficiency Orwoll.BI0056.BI 0.158371
BI0131 CTATTTGCGACA GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq MrOS_VitaminD ... 0.0 0.0 1: Yes 1: Yes 23.1 52.1 2:SPRING sufficiency Orwoll.BI0131.BI 0.237864
BI0153 ATCGGCGTTACA GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq MrOS_VitaminD ... 0.0 0.0 1: Yes 1: Yes 27.3 43.1 2:SPRING sufficiency Orwoll.BI0153.BI 0.453110
BI0215 CCTCTCGTGATC GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq MrOS_VitaminD ... 0.0 0.0 1: Yes 1: Yes 33.0 50.2 4:FALL sufficiency Orwoll.BI0215.BI 0.393276

5 rows × 65 columns


In [115]:
table = df[['OHV1D3', 'OHVD3', 'k__Bacteria;p__Firmicutes']]
table = table.rename(columns = {'k__Bacteria;p__Firmicutes':'Firmicutes'})

In [116]:
table = table.apply(pd.to_numeric, errors='coerce')

In [117]:
table.head()


Out[117]:
OHV1D3 OHVD3 Firmicutes
BI0023 39.3 25.8 0.481185
BI0056 61.9 39.2 0.158371
BI0131 52.1 23.1 0.237864
BI0153 43.1 27.3 0.453110
BI0215 50.2 33.0 0.393276

In [143]:
table.to_csv('firmicutes.txt', sep='\t')

In [118]:
import seaborn as sns; sns.set(color_codes=True)
ax = sns.regplot(x="OHV1D3", y="Firmicutes", data=table)



In [103]:
ax = sns.regplot(x="OHVD3", y="k__Bacteria;p__Firmicutes", data=table)



In [142]:
import statsmodels.api as sm
y = table[["Firmicutes"]]
X = table[['OHV1D3', 'OHVD3']]

model = sm.OLS(y, X, missing='drop')
results = model.fit()
# Statsmodels gives R-like statistical output
print(results.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:             Firmicutes   R-squared:                       0.804
Model:                            OLS   Adj. R-squared:                  0.804
Method:                 Least Squares   F-statistic:                     1139.
Date:                Tue, 15 Aug 2017   Prob (F-statistic):          5.36e-197
Time:                        10:33:38   Log-Likelihood:                 135.15
No. Observations:                 556   AIC:                            -266.3
Df Residuals:                     554   BIC:                            -257.7
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
OHV1D3         0.0039      0.000     10.136      0.000       0.003       0.005
OHVD3          0.0041      0.001      6.663      0.000       0.003       0.005
==============================================================================
Omnibus:                        5.996   Durbin-Watson:                   2.014
Prob(Omnibus):                  0.050   Jarque-Bera (JB):                6.306
Skew:                           0.179   Prob(JB):                       0.0427
Kurtosis:                       3.380   Cond. No.                         6.32
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [145]:
y = table[["Firmicutes"]]
X = table[['OHV1D3']]

model = sm.OLS(y, X, missing='drop')
results = model.fit()
print(results.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:             Firmicutes   R-squared:                       0.791
Model:                            OLS   Adj. R-squared:                  0.790
Method:                 Least Squares   F-statistic:                     2140.
Date:                Tue, 15 Aug 2017   Prob (F-statistic):          1.93e-194
Time:                        10:54:42   Log-Likelihood:                 118.02
No. Observations:                 567   AIC:                            -234.0
Df Residuals:                     566   BIC:                            -229.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
OHV1D3         0.0063      0.000     46.257      0.000       0.006       0.007
==============================================================================
Omnibus:                       10.634   Durbin-Watson:                   2.001
Prob(Omnibus):                  0.005   Jarque-Bera (JB):               16.462
Skew:                           0.117   Prob(JB):                     0.000266
Kurtosis:                       3.801   Cond. No.                         1.00
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [146]:
y = table[["Firmicutes"]]
X = table[['OHVD3']]

model = sm.OLS(y, X, missing='drop')
results = model.fit()
print(results.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:             Firmicutes   R-squared:                       0.768
Model:                            OLS   Adj. R-squared:                  0.768
Method:                 Least Squares   F-statistic:                     1838.
Date:                Tue, 15 Aug 2017   Prob (F-statistic):          2.91e-178
Time:                        10:55:12   Log-Likelihood:                 87.851
No. Observations:                 556   AIC:                            -173.7
Df Residuals:                     555   BIC:                            -169.4
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
OHVD3          0.0101      0.000     42.874      0.000       0.010       0.011
==============================================================================
Omnibus:                        5.118   Durbin-Watson:                   1.952
Prob(Omnibus):                  0.077   Jarque-Bera (JB):                6.323
Skew:                          -0.075   Prob(JB):                       0.0424
Kurtosis:                       3.500   Cond. No.                         1.00
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [ ]: