In [1]:
import pandas as pd

Figure 2 data generation (Excel workbook)

Figure data consolidation for Figure 2, which deals with alpha and beta diversity of samples

Figure 2a: alpha diversity plot

For this figure, we need to output an Excel sheet that contains the following columns parsed from the universal metadata file:

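  1. observed sequences (`adiv_observed_otus`)
  2. EMPO level 3 (`empo_3`)

The exported table also keeps the sample ID (`#SampleID`) and EMPO level 1 (`empo_1`).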

In [2]:
# Location of the QC-filtered EMP mapping file (tab-separated, first row is the header)
metadata_fp = '../../../data/mapping-files/emp_qiime_mapping_qc_filtered.tsv'

# Read the whole mapping file into a DataFrame, one row per sample
metadata = pd.read_csv(
    metadata_fp,
    sep='\t',
    header=0,
)

# Preview the first rows
metadata.head()


Out[2]:
#SampleID BarcodeSequence LinkerPrimerSequence Description host_subject_id study_id title principal_investigator doi ebi_accession ... adiv_shannon adiv_faith_pd temperature_deg_c ph salinity_psu oxygen_mg_per_l phosphate_umol_per_l ammonium_umol_per_l nitrate_umol_per_l sulfate_umol_per_l
0 550.L1S1.s.1.sequence AACGCACGCTAG GTGCCAGCMGCCGCGGTAA sample_1 stool F4 550 Moving pictures of the human microbiome Rob Knight 10.1186/gb-2011-12-5-r50 ERP021896 ... 4.244831 13.631804 NaN NaN NaN NaN NaN NaN NaN NaN
1 550.L1S10.s.1.sequence ACAGACCACTCA GTGCCAGCMGCCGCGGTAA sample_2 stool F4 550 Moving pictures of the human microbiome Rob Knight 10.1186/gb-2011-12-5-r50 ERP021896 ... 3.027416 9.425835 NaN NaN NaN NaN NaN NaN NaN NaN
2 550.L1S100.s.1.sequence ATGCACTGGCGA GTGCCAGCMGCCGCGGTAA sample_3 stool F4 550 Moving pictures of the human microbiome Rob Knight 10.1186/gb-2011-12-5-r50 ERP021896 ... 3.196420 10.491161 NaN NaN NaN NaN NaN NaN NaN NaN
3 550.L1S101.s.1.sequence ATTATCGTGCAC GTGCCAGCMGCCGCGGTAA sample_4 stool F4 550 Moving pictures of the human microbiome Rob Knight 10.1186/gb-2011-12-5-r50 ERP021896 ... 3.714719 11.384689 NaN NaN NaN NaN NaN NaN NaN NaN
4 550.L1S102.s.1.sequence CACGACAGGCTA GTGCCAGCMGCCGCGGTAA sample_5 stool F4 550 Moving pictures of the human microbiome Rob Knight 10.1186/gb-2011-12-5-r50 ERP021896 ... 3.969038 15.162691 NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 76 columns


In [3]:
# Show every column name available in the metadata table
metadata.columns


Out[3]:
Index(['#SampleID', 'BarcodeSequence', 'LinkerPrimerSequence', 'Description',
       'host_subject_id', 'study_id', 'title', 'principal_investigator', 'doi',
       'ebi_accession', 'target_gene', 'target_subfragment', 'pcr_primers',
       'illumina_technology', 'extraction_center', 'run_center', 'run_date',
       'read_length_bp', 'sequences_split_libraries',
       'observations_closed_ref_greengenes', 'observations_closed_ref_silva',
       'observations_open_ref_greengenes', 'observations_deblur_90bp',
       'observations_deblur_100bp', 'observations_deblur_150bp',
       'emp_release1', 'qc_filtered', 'subset_10k', 'subset_5k', 'subset_2k',
       'sample_taxid', 'sample_scientific_name', 'host_taxid',
       'host_common_name_provided', 'host_common_name', 'host_scientific_name',
       'host_superkingdom', 'host_kingdom', 'host_phylum', 'host_class',
       'host_order', 'host_family', 'host_genus', 'host_species',
       'collection_timestamp', 'country', 'latitude_deg', 'longitude_deg',
       'depth_m', 'altitude_m', 'elevation_m', 'env_biome', 'env_feature',
       'env_material', 'envo_biome_0', 'envo_biome_1', 'envo_biome_2',
       'envo_biome_3', 'envo_biome_4', 'envo_biome_5', 'empo_0', 'empo_1',
       'empo_2', 'empo_3', 'adiv_observed_otus', 'adiv_chao1', 'adiv_shannon',
       'adiv_faith_pd', 'temperature_deg_c', 'ph', 'salinity_psu',
       'oxygen_mg_per_l', 'phosphate_umol_per_l', 'ammonium_umol_per_l',
       'nitrate_umol_per_l', 'sulfate_umol_per_l'],
      dtype='object')

In [4]:
# Panel 2a table: sample ID, EMPO levels 1 and 3, and the observed-OTU count per sample
fig2a = metadata[['#SampleID', 'empo_1', 'empo_3', 'adiv_observed_otus']]

# Preview the first rows
fig2a.head()


Out[4]:
#SampleID empo_1 empo_3 adiv_observed_otus
0 550.L1S1.s.1.sequence Host-associated Animal distal gut 84
1 550.L1S10.s.1.sequence Host-associated Animal distal gut 68
2 550.L1S100.s.1.sequence Host-associated Animal distal gut 80
3 550.L1S101.s.1.sequence Host-associated Animal distal gut 80
4 550.L1S102.s.1.sequence Host-associated Animal distal gut 108

Figure 2b: alpha diversity by temperature and pH

For this figure, we need to output an Excel sheet that contains the following columns parsed from the universal metadata file:

  1. observed sequences (`adiv_observed_otus`)
  2. pH (`ph`)
  3. temperature (`temperature_deg_c`)

In [5]:
# Panel 2b table: sample ID, temperature, pH and the observed-OTU count.
# dropna(thresh=3) keeps only rows with at least 3 non-null values among these
# 4 columns, i.e. rows with more than one NaN are dropped.
fig2b = (
    metadata
    .loc[:, ['#SampleID', 'temperature_deg_c', 'ph', 'adiv_observed_otus']]
    .dropna(thresh=3)
)

# Preview the first rows
fig2b.head()


Out[5]:
#SampleID temperature_deg_c ph adiv_observed_otus
1537 632.Agricultural.soil.soy NaN 7.6 1019
1538 632.Agricultural.soil.wheat NaN 7.4 1090
1539 632.Arctic.Tundra.1 NaN 3.9 671
1540 632.Arctic.Tundra.2 NaN 6.7 757
1541 632.Boreal.coniferous.forest NaN 4.6 617

Figure 2c: beta diversity PCoA

For this figure, we need to output an Excel sheet that contains the first ten PCoA coordinates (unweighted UniFrac) merged with EMPO information (levels 0–3) by sample ID (`#SampleID`).


In [6]:
# PCoA coordinates: tab-separated file without a header row, so column names are
# supplied here (sample ID followed by the first ten principal coordinates).
pcoa = pd.read_csv('./emp_90_gg_1k_unweighted_unifrac.txt.pc.first_ten',
                   header=None,
                   sep='\t',
                   names=['#SampleID'] + ['PC{}'.format(i) for i in range(1, 11)])

# EMPO annotation (levels 0-3) for every sample in the metadata table
empo = metadata.loc[:, ['#SampleID', 'empo_0', 'empo_1', 'empo_2', 'empo_3']]

# Join explicitly on the sample ID instead of relying on pd.merge's default of
# "all shared column names", so the key cannot silently change if either input
# gains an overlapping column. how='inner' (the pandas default, spelled out here)
# keeps only samples present in both tables.
fig2c = pd.merge(left=empo, right=pcoa, on='#SampleID', how='inner')

# Preview the first rows
fig2c.head()


Out[6]:
#SampleID empo_0 empo_1 empo_2 empo_3 PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
0 550.L1S1.s.1.sequence EMP sample Host-associated Animal Animal distal gut -0.296317 0.077773 0.057479 0.019484 -0.028889 0.113648 -0.084346 0.134690 -0.033597 0.017126
1 550.L1S10.s.1.sequence EMP sample Host-associated Animal Animal distal gut -0.278095 0.070993 -0.050654 0.096955 0.029117 0.123500 -0.140389 0.247012 -0.081065 0.016625
2 550.L1S100.s.1.sequence EMP sample Host-associated Animal Animal distal gut -0.219183 0.109322 -0.108332 0.075268 0.035042 0.111289 -0.154783 0.314899 -0.106667 0.031718
3 550.L1S101.s.1.sequence EMP sample Host-associated Animal Animal distal gut -0.243843 0.138947 -0.118037 0.070988 0.036605 0.092981 -0.163341 0.327321 -0.117587 0.049399
4 550.L1S102.s.1.sequence EMP sample Host-associated Animal Animal distal gut -0.241271 0.169543 -0.037585 0.005261 -0.031439 0.138188 -0.134458 0.277817 -0.085011 0.016720

Figure 2d: Estimated rRNA operon copy number by environment

For this figure, we will output each sample as a separate row with its estimated average rRNA operon copy number (`averagecopy`), plus EMPO metadata.


In [9]:
# Read the per-sample predicted rRNA copy-number table (comma-separated, header in first row)
# NOTE(review): the preview shows a '#SampleID.1' column that is NaN in the first rows —
# presumably a duplicated header in the CSV; confirm whether it should be exported.
fig2d = pd.read_csv(
    '../../../data/predicted-rrna-copy-number/emp_rrna_averagecopy_empo.csv'
)

# Preview the first rows
fig2d.head()


Out[9]:
#SampleID #SampleID.1 empo_0 empo_1 empo_2 empo_3 averagecopy
0 1001.SKB1 NaN EMP sample Free-living Non-saline Soil (non-saline) 2.688339
1 1001.SKB2 NaN EMP sample Free-living Non-saline Soil (non-saline) 2.722114
2 1001.SKB3 NaN EMP sample Free-living Non-saline Soil (non-saline) 2.776446
3 1001.SKB4 NaN EMP sample Free-living Non-saline Soil (non-saline) 2.976232
4 1001.SKB5 NaN EMP sample Free-living Non-saline Soil (non-saline) NaN

Write to Excel workbook


In [8]:
# Write each panel's table to its own sheet of a single workbook.
# The writer is used as a context manager so the file is saved and closed on exit,
# even if one of the exports raises; ExcelWriter.save() was removed in pandas 2.0.
# sheet_name is passed by keyword because newer pandas deprecates passing it positionally.
with pd.ExcelWriter('Figure2_data.xlsx') as fig2:
    fig2a.to_excel(fig2, sheet_name='Fig-2a')
    fig2b.to_excel(fig2, sheet_name='Fig-2b')
    fig2c.to_excel(fig2, sheet_name='Fig-2c')
    fig2d.to_excel(fig2, sheet_name='Fig-2d')

In [ ]: