In [1]:

    
import pandas as pd

Figure 1 csv data generation

Figure data consolidation for Figure 1, which maps samples and shows distribution across EMPO categories

Figure 1a and 1b

For these figure panels, we need only the sample IDs, the EMPO level categories, and the latitude/longitude coordinates.



In [3]:

    
# Read the QC-filtered EMP QIIME mapping file into a DataFrame.
# The file is tab-delimited and its first row supplies the column names.

metadata_fp = '../../../data/mapping-files/emp_qiime_mapping_qc_filtered.tsv'

metadata = pd.read_csv(
    metadata_fp,
    sep='\t',
    header=0,
)

# Preview the first few rows of the loaded table.
metadata.head()









    Out[3]:







  
    
      
      #SampleID
      BarcodeSequence
      LinkerPrimerSequence
      Description
      host_subject_id
      study_id
      title
      principal_investigator
      doi
      ebi_accession
      ...
      adiv_shannon
      adiv_faith_pd
      temperature_deg_c
      ph
      salinity_psu
      oxygen_mg_per_l
      phosphate_umol_per_l
      ammonium_umol_per_l
      nitrate_umol_per_l
      sulfate_umol_per_l
    
  
  
    
      0
      550.L1S1.s.1.sequence
      AACGCACGCTAG
      GTGCCAGCMGCCGCGGTAA
      sample_1 stool
      F4
      550
      Moving pictures of the human microbiome
      Rob Knight
      10.1186/gb-2011-12-5-r50
      ERP021896
      ...
      4.244831
      13.631804
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      1
      550.L1S10.s.1.sequence
      ACAGACCACTCA
      GTGCCAGCMGCCGCGGTAA
      sample_2 stool
      F4
      550
      Moving pictures of the human microbiome
      Rob Knight
      10.1186/gb-2011-12-5-r50
      ERP021896
      ...
      3.027416
      9.425835
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      2
      550.L1S100.s.1.sequence
      ATGCACTGGCGA
      GTGCCAGCMGCCGCGGTAA
      sample_3 stool
      F4
      550
      Moving pictures of the human microbiome
      Rob Knight
      10.1186/gb-2011-12-5-r50
      ERP021896
      ...
      3.196420
      10.491161
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      3
      550.L1S101.s.1.sequence
      ATTATCGTGCAC
      GTGCCAGCMGCCGCGGTAA
      sample_4 stool
      F4
      550
      Moving pictures of the human microbiome
      Rob Knight
      10.1186/gb-2011-12-5-r50
      ERP021896
      ...
      3.714719
      11.384689
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      4
      550.L1S102.s.1.sequence
      CACGACAGGCTA
      GTGCCAGCMGCCGCGGTAA
      sample_5 stool
      F4
      550
      Moving pictures of the human microbiome
      Rob Knight
      10.1186/gb-2011-12-5-r50
      ERP021896
      ...
      3.969038
      15.162691
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
  

5 rows × 76 columns



In [4]:

    
# Display every column label of the loaded mapping file, to pick out the
# fields needed for the figure panels below.
metadata.columns









    Out[4]:





Index(['#SampleID', 'BarcodeSequence', 'LinkerPrimerSequence', 'Description',
       'host_subject_id', 'study_id', 'title', 'principal_investigator', 'doi',
       'ebi_accession', 'target_gene', 'target_subfragment', 'pcr_primers',
       'illumina_technology', 'extraction_center', 'run_center', 'run_date',
       'read_length_bp', 'sequences_split_libraries',
       'observations_closed_ref_greengenes', 'observations_closed_ref_silva',
       'observations_open_ref_greengenes', 'observations_deblur_90bp',
       'observations_deblur_100bp', 'observations_deblur_150bp',
       'emp_release1', 'qc_filtered', 'subset_10k', 'subset_5k', 'subset_2k',
       'sample_taxid', 'sample_scientific_name', 'host_taxid',
       'host_common_name_provided', 'host_common_name', 'host_scientific_name',
       'host_superkingdom', 'host_kingdom', 'host_phylum', 'host_class',
       'host_order', 'host_family', 'host_genus', 'host_species',
       'collection_timestamp', 'country', 'latitude_deg', 'longitude_deg',
       'depth_m', 'altitude_m', 'elevation_m', 'env_biome', 'env_feature',
       'env_material', 'envo_biome_0', 'envo_biome_1', 'envo_biome_2',
       'envo_biome_3', 'envo_biome_4', 'envo_biome_5', 'empo_0', 'empo_1',
       'empo_2', 'empo_3', 'adiv_observed_otus', 'adiv_chao1', 'adiv_shannon',
       'adiv_faith_pd', 'temperature_deg_c', 'ph', 'salinity_psu',
       'oxygen_mg_per_l', 'phosphate_umol_per_l', 'ammonium_umol_per_l',
       'nitrate_umol_per_l', 'sulfate_umol_per_l'],
      dtype='object')



In [6]:

    
# Subset the metadata to the columns used by this figure panel:
# the sample ID, the four EMPO levels, and the latitude/longitude.

fig1ab = metadata.loc[
    :,
    [
        '#SampleID',
        'empo_0', 'empo_1', 'empo_2', 'empo_3',
        'latitude_deg', 'longitude_deg',
    ]
]
# Preview the first few rows of the subset.
fig1ab.head()









    Out[6]:







  
    
      
      #SampleID
      empo_0
      empo_1
      empo_2
      empo_3
      latitude_deg
      longitude_deg
    
  
  
    
      0
      550.L1S1.s.1.sequence
      EMP sample
      Host-associated
      Animal
      Animal distal gut
      40.015
      -105.271
    
    
      1
      550.L1S10.s.1.sequence
      EMP sample
      Host-associated
      Animal
      Animal distal gut
      40.015
      -105.271
    
    
      2
      550.L1S100.s.1.sequence
      EMP sample
      Host-associated
      Animal
      Animal distal gut
      40.015
      -105.271
    
    
      3
      550.L1S101.s.1.sequence
      EMP sample
      Host-associated
      Animal
      Animal distal gut
      40.015
      -105.271
    
    
      4
      550.L1S102.s.1.sequence
      EMP sample
      Host-associated
      Animal
      Animal distal gut
      40.015
      -105.271

Write to Excel workbook



In [7]:

    
# Write the Figure 1a/1b table to the 'Fig-1ab' sheet of Figure1_data.xlsx.
#
# The writer is used as a context manager so the workbook is saved and closed
# when the block exits, even if the write raises. The explicit
# ExcelWriter.save() call was deprecated in pandas 1.5 and removed in 2.0.
with pd.ExcelWriter('Figure1_data.xlsx') as fig1:
    # sheet_name is passed by keyword; newer pandas deprecates passing it
    # positionally.
    fig1ab.to_excel(fig1, sheet_name='Fig-1ab')

	#SampleID	BarcodeSequence	LinkerPrimerSequence	Description	host_subject_id	study_id	title	principal_investigator	doi	ebi_accession	...	adiv_shannon	adiv_faith_pd	temperature_deg_c	ph	salinity_psu	oxygen_mg_per_l	phosphate_umol_per_l	ammonium_umol_per_l	nitrate_umol_per_l	sulfate_umol_per_l
0	550.L1S1.s.1.sequence	AACGCACGCTAG	GTGCCAGCMGCCGCGGTAA	sample_1 stool	F4	550	Moving pictures of the human microbiome	Rob Knight	10.1186/gb-2011-12-5-r50	ERP021896	...	4.244831	13.631804	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	550.L1S10.s.1.sequence	ACAGACCACTCA	GTGCCAGCMGCCGCGGTAA	sample_2 stool	F4	550	Moving pictures of the human microbiome	Rob Knight	10.1186/gb-2011-12-5-r50	ERP021896	...	3.027416	9.425835	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	550.L1S100.s.1.sequence	ATGCACTGGCGA	GTGCCAGCMGCCGCGGTAA	sample_3 stool	F4	550	Moving pictures of the human microbiome	Rob Knight	10.1186/gb-2011-12-5-r50	ERP021896	...	3.196420	10.491161	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	550.L1S101.s.1.sequence	ATTATCGTGCAC	GTGCCAGCMGCCGCGGTAA	sample_4 stool	F4	550	Moving pictures of the human microbiome	Rob Knight	10.1186/gb-2011-12-5-r50	ERP021896	...	3.714719	11.384689	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	550.L1S102.s.1.sequence	CACGACAGGCTA	GTGCCAGCMGCCGCGGTAA	sample_5 stool	F4	550	Moving pictures of the human microbiome	Rob Knight	10.1186/gb-2011-12-5-r50	ERP021896	...	3.969038	15.162691	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	#SampleID	empo_0	empo_1	empo_2	empo_3	latitude_deg	longitude_deg
0	550.L1S1.s.1.sequence	EMP sample	Host-associated	Animal	Animal distal gut	40.015	-105.271
1	550.L1S10.s.1.sequence	EMP sample	Host-associated	Animal	Animal distal gut	40.015	-105.271
2	550.L1S100.s.1.sequence	EMP sample	Host-associated	Animal	Animal distal gut	40.015	-105.271
3	550.L1S101.s.1.sequence	EMP sample	Host-associated	Animal	Animal distal gut	40.015	-105.271
4	550.L1S102.s.1.sequence	EMP sample	Host-associated	Animal	Animal distal gut	40.015	-105.271