In [1]:
import pandas as pd

Figure 1 csv data generation

Consolidates the data behind Figure 1, which maps the samples and shows their distribution across EMPO categories.

Figure 1a and 1b

For these figure panels, we just need the sample IDs, the EMPO categories at each level (empo_0 through empo_3), and the latitude/longitude coordinates.


In [3]:
# Read the sample metadata mapping file into a DataFrame.
# The file is tab-separated and its first row holds the column names.

metadata_fp = '../../../data/mapping-files/emp_qiime_mapping_qc_filtered.tsv'

metadata = pd.read_csv(
    metadata_fp,
    sep='\t',
    header=0,
)

# Preview the first rows as the cell output
metadata.head()


Out[3]:
#SampleID BarcodeSequence LinkerPrimerSequence Description host_subject_id study_id title principal_investigator doi ebi_accession ... adiv_shannon adiv_faith_pd temperature_deg_c ph salinity_psu oxygen_mg_per_l phosphate_umol_per_l ammonium_umol_per_l nitrate_umol_per_l sulfate_umol_per_l
0 550.L1S1.s.1.sequence AACGCACGCTAG GTGCCAGCMGCCGCGGTAA sample_1 stool F4 550 Moving pictures of the human microbiome Rob Knight 10.1186/gb-2011-12-5-r50 ERP021896 ... 4.244831 13.631804 NaN NaN NaN NaN NaN NaN NaN NaN
1 550.L1S10.s.1.sequence ACAGACCACTCA GTGCCAGCMGCCGCGGTAA sample_2 stool F4 550 Moving pictures of the human microbiome Rob Knight 10.1186/gb-2011-12-5-r50 ERP021896 ... 3.027416 9.425835 NaN NaN NaN NaN NaN NaN NaN NaN
2 550.L1S100.s.1.sequence ATGCACTGGCGA GTGCCAGCMGCCGCGGTAA sample_3 stool F4 550 Moving pictures of the human microbiome Rob Knight 10.1186/gb-2011-12-5-r50 ERP021896 ... 3.196420 10.491161 NaN NaN NaN NaN NaN NaN NaN NaN
3 550.L1S101.s.1.sequence ATTATCGTGCAC GTGCCAGCMGCCGCGGTAA sample_4 stool F4 550 Moving pictures of the human microbiome Rob Knight 10.1186/gb-2011-12-5-r50 ERP021896 ... 3.714719 11.384689 NaN NaN NaN NaN NaN NaN NaN NaN
4 550.L1S102.s.1.sequence CACGACAGGCTA GTGCCAGCMGCCGCGGTAA sample_5 stool F4 550 Moving pictures of the human microbiome Rob Knight 10.1186/gb-2011-12-5-r50 ERP021896 ... 3.969038 15.162691 NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 76 columns


In [4]:
# Display the column labels of the metadata table (the next cell selects a subset of them by name)
metadata.columns


Out[4]:
Index(['#SampleID', 'BarcodeSequence', 'LinkerPrimerSequence', 'Description',
       'host_subject_id', 'study_id', 'title', 'principal_investigator', 'doi',
       'ebi_accession', 'target_gene', 'target_subfragment', 'pcr_primers',
       'illumina_technology', 'extraction_center', 'run_center', 'run_date',
       'read_length_bp', 'sequences_split_libraries',
       'observations_closed_ref_greengenes', 'observations_closed_ref_silva',
       'observations_open_ref_greengenes', 'observations_deblur_90bp',
       'observations_deblur_100bp', 'observations_deblur_150bp',
       'emp_release1', 'qc_filtered', 'subset_10k', 'subset_5k', 'subset_2k',
       'sample_taxid', 'sample_scientific_name', 'host_taxid',
       'host_common_name_provided', 'host_common_name', 'host_scientific_name',
       'host_superkingdom', 'host_kingdom', 'host_phylum', 'host_class',
       'host_order', 'host_family', 'host_genus', 'host_species',
       'collection_timestamp', 'country', 'latitude_deg', 'longitude_deg',
       'depth_m', 'altitude_m', 'elevation_m', 'env_biome', 'env_feature',
       'env_material', 'envo_biome_0', 'envo_biome_1', 'envo_biome_2',
       'envo_biome_3', 'envo_biome_4', 'envo_biome_5', 'empo_0', 'empo_1',
       'empo_2', 'empo_3', 'adiv_observed_otus', 'adiv_chao1', 'adiv_shannon',
       'adiv_faith_pd', 'temperature_deg_c', 'ph', 'salinity_psu',
       'oxygen_mg_per_l', 'phosphate_umol_per_l', 'ammonium_umol_per_l',
       'nitrate_umol_per_l', 'sulfate_umol_per_l'],
      dtype='object')

In [6]:
# Columns required for panels 1a/1b: the sample identifier, the four EMPO
# columns, and the latitude/longitude columns.
fig1ab_columns = [
    '#SampleID',
    'empo_0',
    'empo_1',
    'empo_2',
    'empo_3',
    'latitude_deg',
    'longitude_deg',
]

# Keep every row, restricted to the columns listed above
fig1ab = metadata.loc[:, fig1ab_columns]
fig1ab.head()


Out[6]:
#SampleID empo_0 empo_1 empo_2 empo_3 latitude_deg longitude_deg
0 550.L1S1.s.1.sequence EMP sample Host-associated Animal Animal distal gut 40.015 -105.271
1 550.L1S10.s.1.sequence EMP sample Host-associated Animal Animal distal gut 40.015 -105.271
2 550.L1S100.s.1.sequence EMP sample Host-associated Animal Animal distal gut 40.015 -105.271
3 550.L1S101.s.1.sequence EMP sample Host-associated Animal Animal distal gut 40.015 -105.271
4 550.L1S102.s.1.sequence EMP sample Host-associated Animal Animal distal gut 40.015 -105.271

Write to an Excel workbook


In [7]:
# Write the Figure 1a/1b table to the 'Fig-1ab' sheet of the figure workbook.
# The context manager saves and closes the workbook on exit (even if the write
# raises), replacing the explicit ExcelWriter.save() call, which was deprecated
# in pandas 1.5 and removed in pandas 2.0.
with pd.ExcelWriter('Figure1_data.xlsx') as fig1:
    # sheet_name is passed by keyword: positional use is deprecated in newer pandas.
    # The DataFrame index is still written (default), matching the previous output.
    fig1ab.to_excel(fig1, sheet_name='Fig-1ab')