In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
In [8]:
mf = pd.read_csv('mapping_MrOS.txt', sep='\t', dtype=str, index_col='#SampleID', na_values='Missing:Not collected')
In [80]:
mf.head()
Out[80]:
BarcodeSequence
LinkerPrimerSequence
Experiment_Design_Description
Library_Construction_Protocol
Linker
Platform
Center_Name
Center_Project
Instrument_Model
Title
...
OHVD3
OHVD2
OHV1D2
OHV1D2CT
OHVD2CT
OHVDTOT
OHV1DTOT
OHSEAS
VDstatus
Description
#SampleID
BI0023
TCTGGTGACATT
GGACTACHVGGGTWTCTAAT
16S stool samples sequenced for MrOS Vitamin D...
16S rRNA v4
GT
Illumina
BI
MrOS
Illumina MiSeq
MrOS_VitaminD
...
25.8
0.0
0.0
1: Yes
1: Yes
25.8
39.3
3:SUMMER
sufficiency
Orwoll.BI0023.BI
BI0056
CAAGCATGCCTA
GGACTACHVGGGTWTCTAAT
16S stool samples sequenced for MrOS Vitamin D...
16S rRNA v4
GT
Illumina
BI
MrOS
Illumina MiSeq
MrOS_VitaminD
...
39.2
0.0
0.0
1: Yes
1: Yes
39.2
61.9
2:SPRING
sufficiency
Orwoll.BI0056.BI
BI0131
CTATTTGCGACA
GGACTACHVGGGTWTCTAAT
16S stool samples sequenced for MrOS Vitamin D...
16S rRNA v4
GT
Illumina
BI
MrOS
Illumina MiSeq
MrOS_VitaminD
...
23.1
0.0
0.0
1: Yes
1: Yes
23.1
52.1
2:SPRING
sufficiency
Orwoll.BI0131.BI
BI0153
ATCGGCGTTACA
GGACTACHVGGGTWTCTAAT
16S stool samples sequenced for MrOS Vitamin D...
16S rRNA v4
GT
Illumina
BI
MrOS
Illumina MiSeq
MrOS_VitaminD
...
27.3
0.0
0.0
1: Yes
1: Yes
27.3
43.1
2:SPRING
sufficiency
Orwoll.BI0153.BI
BI0215
CCTCTCGTGATC
GGACTACHVGGGTWTCTAAT
16S stool samples sequenced for MrOS Vitamin D...
16S rRNA v4
GT
Illumina
BI
MrOS
Illumina MiSeq
MrOS_VitaminD
...
33
0.0
0.0
1: Yes
1: Yes
33.0
50.2
4:FALL
sufficiency
Orwoll.BI0215.BI
5 rows × 64 columns
In [9]:
mf.head()
Out[9]:
BarcodeSequence
LinkerPrimerSequence
Experiment_Design_Description
Library_Construction_Protocol
Linker
Platform
Center_Name
Center_Project
Instrument_Model
Title
...
OHVD3
OHVD2
OHV1D2
OHV1D2CT
OHVD2CT
OHVDTOT
OHV1DTOT
OHSEAS
VDstatus
Description
#SampleID
BI0023
TCTGGTGACATT
GGACTACHVGGGTWTCTAAT
16S stool samples sequenced for MrOS Vitamin D...
16S rRNA v4
GT
Illumina
BI
MrOS
Illumina MiSeq
MrOS_VitaminD
...
25.8
0.0
0.0
1: Yes
1: Yes
25.8
39.3
3:SUMMER
sufficiency
Orwoll.BI0023.BI
BI0056
CAAGCATGCCTA
GGACTACHVGGGTWTCTAAT
16S stool samples sequenced for MrOS Vitamin D...
16S rRNA v4
GT
Illumina
BI
MrOS
Illumina MiSeq
MrOS_VitaminD
...
39.2
0.0
0.0
1: Yes
1: Yes
39.2
61.9
2:SPRING
sufficiency
Orwoll.BI0056.BI
BI0131
CTATTTGCGACA
GGACTACHVGGGTWTCTAAT
16S stool samples sequenced for MrOS Vitamin D...
16S rRNA v4
GT
Illumina
BI
MrOS
Illumina MiSeq
MrOS_VitaminD
...
23.1
0.0
0.0
1: Yes
1: Yes
23.1
52.1
2:SPRING
sufficiency
Orwoll.BI0131.BI
BI0153
ATCGGCGTTACA
GGACTACHVGGGTWTCTAAT
16S stool samples sequenced for MrOS Vitamin D...
16S rRNA v4
GT
Illumina
BI
MrOS
Illumina MiSeq
MrOS_VitaminD
...
27.3
0.0
0.0
1: Yes
1: Yes
27.3
43.1
2:SPRING
sufficiency
Orwoll.BI0153.BI
BI0215
CCTCTCGTGATC
GGACTACHVGGGTWTCTAAT
16S stool samples sequenced for MrOS Vitamin D...
16S rRNA v4
GT
Illumina
BI
MrOS
Illumina MiSeq
MrOS_VitaminD
...
33
0.0
0.0
1: Yes
1: Yes
33.0
50.2
4:FALL
sufficiency
Orwoll.BI0215.BI
5 rows × 64 columns
In [10]:
!pwd
/Users/serene/Knight/VitaminD/MrOS_VitaminD/notebooks
In [108]:
dat1 = pd.read_csv('../../1.1_cd_20k/taxa_plots/taxa_summary_plots/raw_data/table_mc5876_sorted_L2.txt',
skiprows=1, sep='\t', dtype=str, index_col = '#OTU ID')
In [109]:
dat1.head()
Out[109]:
BI0023
BI0056
BI0131
BI0153
BI0215
BI0353
BI0371
BI0372
BI0380
BI0389
...
SD8964
SD8966
SD8972
SD8973
SD8985
SD8996
SD8999
SD9001
SD9003
SD9009
#OTU ID
Unassigned;Other
0.00299015548809
0.0
9.61261174661e-05
0.0
0.0
0.0
0.000276518084283
0.00083811488621
0.0
0.0
...
0.0
0.0
0.000346040717458
0.00404538727183
0.0
0.0
0.00166008357662
0.0
0.0
0.0
k__Archaea;p__Euryarchaeota
0.000276014352746
0.00010055304173
9.61261174661e-05
0.0
0.0
0.0
0.000331821701139
0.0
0.000159650896706
0.0
...
0.0
0.0
0.000230693811639
0.000888011840158
0.0
0.0
0.0
0.0
0.00012025012025
0.0
k__Bacteria;p__Actinobacteria
0.00740638513203
0.000703871292107
0.000240315293665
0.00693939594211
0.000622629761703
9.95173408967e-05
0.00143789403827
0.00477080781381
0.000585386621255
0.0
...
0.00326422051623
0.004073858526
0.000403714170367
0.00236803157375
0.00689437065149
0.00164134060465
0.00217528192799
0.0
0.00210437710438
0.0116162877842
k__Bacteria;p__Bacteroidetes
0.45496365811
0.79069884364
0.283572046525
0.522040843302
0.548876436294
0.529083942877
0.461619289902
0.631874153826
0.385823000373
0.640941928313
...
0.666445022064
0.000846516057351
0.569698367841
0.277355698076
0.460215053763
0.553555355536
0.67588299273
0.739278420694
0.474326599327
0.480139895079
k__Bacteria;p__Cyanobacteria
0.0
0.0
0.0
0.0
0.0186788928511
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.00105892942235
0.0
0.0
0.0
0.0
5 rows × 599 columns
In [110]:
dat1.shape
Out[110]:
(14, 599)
In [112]:
dat1.index
Out[112]:
Index(['Unassigned;Other', 'k__Archaea;p__Euryarchaeota',
'k__Bacteria;p__Actinobacteria', 'k__Bacteria;p__Bacteroidetes',
'k__Bacteria;p__Cyanobacteria', 'k__Bacteria;p__Elusimicrobia',
'k__Bacteria;p__Firmicutes', 'k__Bacteria;p__Fusobacteria',
'k__Bacteria;p__Lentisphaerae', 'k__Bacteria;p__Proteobacteria',
'k__Bacteria;p__Spirochaetes', 'k__Bacteria;p__Synergistetes',
'k__Bacteria;p__Tenericutes', 'k__Bacteria;p__Verrucomicrobia'],
dtype='object', name='#OTU ID')
In [85]:
dat1 = dat1.transpose()
In [86]:
dat1.head()
Out[86]:
#OTU ID
Unassigned;Other
k__Archaea;p__Euryarchaeota
k__Bacteria;p__Actinobacteria
k__Bacteria;p__Bacteroidetes
k__Bacteria;p__Cyanobacteria
k__Bacteria;p__Elusimicrobia
k__Bacteria;p__Firmicutes
k__Bacteria;p__Fusobacteria
k__Bacteria;p__Lentisphaerae
k__Bacteria;p__Proteobacteria
k__Bacteria;p__Spirochaetes
k__Bacteria;p__Synergistetes
k__Bacteria;p__Tenericutes
k__Bacteria;p__Verrucomicrobia
BI0023
0.002990
0.000276
0.007406
0.454964
0.000000
0.0
0.481185
0.0
0.0
0.010443
0.0
0.000092
0.010259
0.032386
BI0056
0.000000
0.000101
0.000704
0.790699
0.000000
0.0
0.158371
0.0
0.0
0.049422
0.0
0.000000
0.000151
0.000553
BI0131
0.000096
0.000096
0.000240
0.283572
0.000000
0.0
0.237864
0.0
0.0
0.459387
0.0
0.000000
0.008075
0.010670
BI0153
0.000000
0.000000
0.006939
0.522041
0.000000
0.0
0.453110
0.0
0.0
0.016258
0.0
0.000000
0.000000
0.001652
BI0215
0.000000
0.000000
0.000623
0.548876
0.018679
0.0
0.393276
0.0
0.0
0.030339
0.0
0.000000
0.000000
0.008207
In [104]:
dat1.columns
Out[104]:
Index(['Unassigned;Other', 'k__Archaea;p__Euryarchaeota',
'k__Bacteria;p__Actinobacteria', 'k__Bacteria;p__Bacteroidetes',
'k__Bacteria;p__Cyanobacteria', 'k__Bacteria;p__Elusimicrobia',
'k__Bacteria;p__Firmicutes', 'k__Bacteria;p__Fusobacteria',
'k__Bacteria;p__Lentisphaerae', 'k__Bacteria;p__Proteobacteria',
'k__Bacteria;p__Spirochaetes', 'k__Bacteria;p__Synergistetes',
'k__Bacteria;p__Tenericutes', 'k__Bacteria;p__Verrucomicrobia'],
dtype='object', name='#OTU ID')
In [88]:
dat = dat1.loc[:, ['k__Bacteria;p__Firmicutes']]
In [89]:
df = pd.merge(mf, dat, left_index=True, right_index=True)
In [90]:
dat.shape
Out[90]:
(599, 1)
In [91]:
mf.shape
Out[91]:
(599, 64)
In [92]:
df.shape
Out[92]:
(599, 65)
In [93]:
df.head()
Out[93]:
BarcodeSequence
LinkerPrimerSequence
Experiment_Design_Description
Library_Construction_Protocol
Linker
Platform
Center_Name
Center_Project
Instrument_Model
Title
...
OHVD2
OHV1D2
OHV1D2CT
OHVD2CT
OHVDTOT
OHV1DTOT
OHSEAS
VDstatus
Description
k__Bacteria;p__Firmicutes
BI0023
TCTGGTGACATT
GGACTACHVGGGTWTCTAAT
16S stool samples sequenced for MrOS Vitamin D...
16S rRNA v4
GT
Illumina
BI
MrOS
Illumina MiSeq
MrOS_VitaminD
...
0.0
0.0
1: Yes
1: Yes
25.8
39.3
3:SUMMER
sufficiency
Orwoll.BI0023.BI
0.481185
BI0056
CAAGCATGCCTA
GGACTACHVGGGTWTCTAAT
16S stool samples sequenced for MrOS Vitamin D...
16S rRNA v4
GT
Illumina
BI
MrOS
Illumina MiSeq
MrOS_VitaminD
...
0.0
0.0
1: Yes
1: Yes
39.2
61.9
2:SPRING
sufficiency
Orwoll.BI0056.BI
0.158371
BI0131
CTATTTGCGACA
GGACTACHVGGGTWTCTAAT
16S stool samples sequenced for MrOS Vitamin D...
16S rRNA v4
GT
Illumina
BI
MrOS
Illumina MiSeq
MrOS_VitaminD
...
0.0
0.0
1: Yes
1: Yes
23.1
52.1
2:SPRING
sufficiency
Orwoll.BI0131.BI
0.237864
BI0153
ATCGGCGTTACA
GGACTACHVGGGTWTCTAAT
16S stool samples sequenced for MrOS Vitamin D...
16S rRNA v4
GT
Illumina
BI
MrOS
Illumina MiSeq
MrOS_VitaminD
...
0.0
0.0
1: Yes
1: Yes
27.3
43.1
2:SPRING
sufficiency
Orwoll.BI0153.BI
0.453110
BI0215
CCTCTCGTGATC
GGACTACHVGGGTWTCTAAT
16S stool samples sequenced for MrOS Vitamin D...
16S rRNA v4
GT
Illumina
BI
MrOS
Illumina MiSeq
MrOS_VitaminD
...
0.0
0.0
1: Yes
1: Yes
33.0
50.2
4:FALL
sufficiency
Orwoll.BI0215.BI
0.393276
5 rows × 65 columns
In [115]:
table = df[['OHV1D3', 'OHVD3', 'k__Bacteria;p__Firmicutes']]
table = table.rename(columns = {'k__Bacteria;p__Firmicutes':'Firmicutes'})
In [116]:
table = table.apply(pd.to_numeric, errors='coerce')
In [117]:
table.head()
Out[117]:
OHV1D3
OHVD3
Firmicutes
BI0023
39.3
25.8
0.481185
BI0056
61.9
39.2
0.158371
BI0131
52.1
23.1
0.237864
BI0153
43.1
27.3
0.453110
BI0215
50.2
33.0
0.393276
In [143]:
table.to_csv('firmicutes.txt', sep='\t')
In [118]:
import seaborn as sns; sns.set(color_codes=True)
ax = sns.regplot(x="OHV1D3", y="Firmicutes", data=table)
In [103]:
ax = sns.regplot(x="OHVD3", y="k__Bacteria;p__Firmicutes", data=table)
In [142]:
import statsmodels.api as sm
y = table[["Firmicutes"]]
X = table[['OHV1D3', 'OHVD3']]
model = sm.OLS(y, X, missing='drop')
results = model.fit()
# Statsmodels gives R-like statistical output
print(results.summary())
OLS Regression Results
==============================================================================
Dep. Variable: Firmicutes R-squared: 0.804
Model: OLS Adj. R-squared: 0.804
Method: Least Squares F-statistic: 1139.
Date: Tue, 15 Aug 2017 Prob (F-statistic): 5.36e-197
Time: 10:33:38 Log-Likelihood: 135.15
No. Observations: 556 AIC: -266.3
Df Residuals: 554 BIC: -257.7
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
OHV1D3 0.0039 0.000 10.136 0.000 0.003 0.005
OHVD3 0.0041 0.001 6.663 0.000 0.003 0.005
==============================================================================
Omnibus: 5.996 Durbin-Watson: 2.014
Prob(Omnibus): 0.050 Jarque-Bera (JB): 6.306
Skew: 0.179 Prob(JB): 0.0427
Kurtosis: 3.380 Cond. No. 6.32
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [145]:
y = table[["Firmicutes"]]
X = table[['OHV1D3']]
model = sm.OLS(y, X, missing='drop')
results = model.fit()
print(results.summary())
OLS Regression Results
==============================================================================
Dep. Variable: Firmicutes R-squared: 0.791
Model: OLS Adj. R-squared: 0.790
Method: Least Squares F-statistic: 2140.
Date: Tue, 15 Aug 2017 Prob (F-statistic): 1.93e-194
Time: 10:54:42 Log-Likelihood: 118.02
No. Observations: 567 AIC: -234.0
Df Residuals: 566 BIC: -229.7
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
OHV1D3 0.0063 0.000 46.257 0.000 0.006 0.007
==============================================================================
Omnibus: 10.634 Durbin-Watson: 2.001
Prob(Omnibus): 0.005 Jarque-Bera (JB): 16.462
Skew: 0.117 Prob(JB): 0.000266
Kurtosis: 3.801 Cond. No. 1.00
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [146]:
y = table[["Firmicutes"]]
X = table[['OHVD3']]
model = sm.OLS(y, X, missing='drop')
results = model.fit()
print(results.summary())
OLS Regression Results
==============================================================================
Dep. Variable: Firmicutes R-squared: 0.768
Model: OLS Adj. R-squared: 0.768
Method: Least Squares F-statistic: 1838.
Date: Tue, 15 Aug 2017 Prob (F-statistic): 2.91e-178
Time: 10:55:12 Log-Likelihood: 87.851
No. Observations: 556 AIC: -173.7
Df Residuals: 555 BIC: -169.4
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
OHVD3 0.0101 0.000 42.874 0.000 0.010 0.011
==============================================================================
Omnibus: 5.118 Durbin-Watson: 1.952
Prob(Omnibus): 0.077 Jarque-Bera (JB): 6.323
Skew: -0.075 Prob(JB): 0.0424
Kurtosis: 3.500 Cond. No. 1.00
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
Content source: serenejiang/MrOS_VitaminD
Similar notebooks: