author: lukethompson@gmail.com
date: 7 Oct 2017
language: Python 3.5
license: BSD3

effect_size_main.ipynb



In [1]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Merging the cleaned mapping file (output of clean_map_emp.py) with alpha diversity for redundancy analysis



In [46]:

    
path_map = '../../data/mapping-files/emp_qiime_mapping_qc_filtered.tsv'
df_map = pd.read_csv(path_map, sep='\t', index_col=0)









    



/Users/luke.thompson/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (46,47,48,49,50,64,65,66,67) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

for adiv



In [47]:

    
# df_qrt is (now also) metadata in deciles for alpha-diversity analysis
df_qrt = pd.read_csv('../../data/effect-size/emp_qiime_mapping_qc_filtered.tsv.quartiles.tsv', sep='\t', index_col=0)
#df_dec = pd.read_csv('../../data/effect-size/emp_qiime_mapping_qc_filtered.tsv.deciles.tsv', sep='\t', index_col=0)



In [48]:

    
df_qrt['adiv_observed_otus'] = [df_map['adiv_observed_otus'][i] for i in df_qrt.index]
df_qrt.drop('oxygen_mg_per_l', axis=1, inplace=True)



In [49]:

    
path_adiv = '../../data/effect-size/emp_qiime_mapping_qc_filtered.quartiles.adiv.tsv'
df_qrt.to_csv(path_adiv, sep='\t')

for bdiv - unweighted



In [7]:

    
# df_qrt is metadata in quartiles for beta-diversity analysis
df_qrt = pd.read_csv('../../data/effect-size/emp_qiime_mapping_qc_filtered.tsv.quartiles.tsv', sep='\t', index_col=0)



In [8]:

    
path_bdiv = '../../data/beta-div/emp_90_gg_1k_unweighted_unifrac.txt.pc.first_ten'
df_bdiv = pd.read_csv(path_bdiv, sep='\t', index_col=0, header=None)
df_bdiv.columns = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10']



In [9]:

    
for col in ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10']:
    df_qrt[col] = [df_bdiv[col][i] for i in df_qrt.index]



In [10]:

    
path_bdiv = '../../data/effect-size/emp_qiime_mapping_qc_filtered.quartiles.bdiv_uw.tsv'
df_qrt.to_csv(path_bdiv, sep='\t')

for bdiv - weighted



In [11]:

    
# df_qrt is metadata in quartiles for beta-diversity analysis
df_qrt = pd.read_csv('../../data/effect-size/emp_qiime_mapping_qc_filtered.tsv.quartiles.tsv' , sep='\t', index_col=0)



In [12]:

    
path_bdiv = '../../data/beta-div/emp_90_gg_1k_weighted_unifrac.txt.pc.first_ten'
df_bdiv = pd.read_csv(path_bdiv, sep='\t', index_col=0, header=None)
df_bdiv.columns = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10']



In [13]:

    
# df_qrt is metadata in quartiles for beta-diversity analysis
for col in ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10']:
    df_qrt[col] = [df_bdiv[col][i] for i in df_qrt.index]



In [14]:

    
path_bdiv = '../../data/effect-size/emp_qiime_mapping_qc_filtered.quartiles.bdiv_w.tsv'
df_qrt.to_csv(path_bdiv, sep='\t')

Exploring the cleaned mapping file (output of clean_map_emp.py)



In [15]:

    
df_dec.study_id.value_counts()









    Out[15]:





1883.0    2438
550.0     1957
894.0     1922
2192.0    1599
1288.0    1483
2229.0    1292
945.0     1103
755.0      893
990.0      698
1521.0     688
1642.0     625
1774.0     599
1632.0     527
1736.0     502
1694.0     450
2382.0     400
925.0      361
1622.0     348
933.0      323
1453.0     298
1064.0     285
678.0      271
1235.0     258
1721.0     255
864.0      229
1024.0     227
1748.0     221
940.0      219
1747.0     184
1621.0     180
722.0      175
2182.0     164
1696.0     157
1030.0     147
1674.0     146
1240.0     141
1665.0     135
1579.0     126
1773.0     121
1035.0     113
1197.0     103
2338.0     103
1526.0      95
1056.0      92
1242.0      91
963.0       91
638.0       89
1034.0      89
1481.0      88
723.0       85
804.0       78
1692.0      75
Name: study_id, dtype: int64



In [16]:

    
df_dec.host_scientific_name.value_counts()









    Out[16]:





Not applicable            12941
Homo sapiens               3612
Ecklonia radiata            936
Rusa unicolor               906
Phyllospora comosa          656
Oryza sativa                615
Syncerus caffer caffer      502
Macropus sp.                460
Pemphiginae                 450
Pygathrix nemaeus           298
Apis mellifera              285
Oryctolagus cuniculus       259
Bos taurus                  180
Vombatidae                  169
Ateles geoffroyi            160
Varanus komodoensis         155
Catostomus                  147
Canis lupus familiaris      104
Iguana iguana                91
Pica pica                    87
Carollia perspicillata       76
Name: host_scientific_name, dtype: int64



In [17]:

    
df_dec.latitude_deg.value_counts()









    Out[17]:





4.0    3581
7.0    2792
2.0    2593
0.0    2557
6.0    2503
9.0    2472
1.0    2429
3.0    2422
8.0    2189
5.0    1363
Name: latitude_deg, dtype: int64



In [18]:

    
df_dec.longitude_deg.value_counts()









    Out[18]:





1.0    4396
8.0    2867
5.0    2679
3.0    2644
0.0    2495
7.0    2494
6.0    2315
9.0    2095
2.0    1828
4.0    1088
Name: longitude_deg, dtype: int64



In [19]:

    
df_dec.envo_biome_3.value_counts()









    Out[19]:





dense settlement biome         7300
Not applicable                 5323
freshwater lake biome          4372
freshwater river biome         2247
cropland biome                 1843
rangeland biome                 895
subtropical shrubland biome     527
village biome                   502
subtropical woodland biome      450
broadleaf forest biome          379
marine benthic biome            314
montane grassland biome         229
polar desert biome              147
tropical shrubland biome        126
marine pelagic biome             85
mixed forest biome               78
Name: envo_biome_3, dtype: int64



In [20]:

    
df_dec.empo_3.value_counts()









    Out[20]:





Water (non-saline)       5080
Soil (non-saline)        4376
Animal distal gut        4196
Animal surface           3454
Plant surface            1615
Surface (non-saline)     1289
Animal secretion         1285
Water (saline)            685
Plant rhizosphere         632
Sediment (saline)         629
Sediment (non-saline)     553
Animal proximal gut       384
Animal corpus             338
Plant corpus              177
Surface (saline)          117
Aerosol (non-saline)       87
Name: empo_3, dtype: int64



In [21]:

    
df_dec.temperature_deg_c.value_counts()









    Out[21]:





7.0    775
3.0    759
0.0    725
1.0    722
2.0    719
5.0    716
9.0    716
6.0    716
4.0    677
8.0    660
Name: temperature_deg_c, dtype: int64



In [22]:

    
df_dec.ph.value_counts()









    Out[22]:





2.0    495
0.0    444
4.0    437
6.0    420
8.0    416
9.0    411
7.0    410
3.0    400
1.0    398
5.0    316
Name: ph, dtype: int64



In [23]:

    
df_dec.salinity_psu.value_counts()









    Out[23]:





4.0    83
3.0    69
8.0    69
7.0    65
1.0    65
0.0    63
2.0    61
9.0    54
6.0    54
Name: salinity_psu, dtype: int64



In [25]:

    
#df_dec.oxygen_mg_per_l.value_counts()



In [26]:

    
df_dec.nitrate_umol_per_l.value_counts()









    Out[26]:





5.0    240
0.0    223
7.0    198
4.0    195
3.0    193
9.0    191
8.0    190
2.0    188
1.0    170
6.0    145
Name: nitrate_umol_per_l, dtype: int64

Plotting effect size results



In [2]:

    
sns.set_style('ticks', {'ytick.color': 'white'})
sns.set_context('poster', font_scale=1.5)
mycolor = sns.xkcd_rgb['light blue']



In [3]:

    
cols = ['study_id', 'host_scientific_name',
    'latitude_deg', 'longitude_deg',
    'envo_biome_3', 'empo_3', 
    'temperature_deg_c', 'ph', 'salinity_psu',
    'oxygen_mg_per_l', 'nitrate_umol_per_l']

Alpha-div



In [4]:

    
results_q = pd.read_csv('../../data/effect-size/' +
                      'mannwhitneyu.alpha_map.adiv_observed_otus.quartiles.tsv', 
                      sep='\t', index_col=0)



In [5]:

    
results_d = pd.read_csv('../../data/effect-size/' +
                      'mannwhitneyu.alpha_map.adiv_observed_otus.deciles.tsv', 
                      sep='\t', index_col=0)



In [7]:

    
myresults_q = results_q.loc[results_q['column_name'].isin(cols)]
myresults_q = myresults_q.sort_values('effect_size', ascending=False)



In [8]:

    
myresults_d = results_d.loc[results_d['column_name'].isin(cols)]
myresults_d = myresults_d.sort_values('effect_size', ascending=False)



In [9]:

    
dict_colors = {'study_id':sns.color_palette('Paired', 12)[1], 'empo_3':sns.color_palette('Paired', 12)[3], 
               'envo_biome_3':sns.color_palette('Paired', 12)[5],'latitude_deg':sns.color_palette('Paired', 12)[7], 
               'longitude_deg':sns.color_palette('Paired', 12)[9], 'host_scientific_name':sns.color_palette('Paired', 12)[0], 
               'salinity_psu':sns.color_palette('Paired', 12)[2], 'temperature_deg_c':sns.color_palette('Paired', 12)[4],
               'ph':sns.color_palette('Paired', 12)[6], 'nitrate_umol_per_l':sns.color_palette('Paired', 12)[8], 
               'oxygen_mg_per_l':sns.color_palette('Paired', 12)[10]}



In [11]:

    
sns.barplot(x='effect_size', y='column_name', data=myresults_d, palette=dict_colors)
plt.xlabel('Mean pairwise effect size')
plt.ylabel('')
plt.tight_layout()
plt.savefig('adiv_effect_size_deciles.pdf')



In [12]:

    
sns.barplot(x='effect_size', y='column_name', data=myresults_q, palette=dict_colors)
plt.xlabel('Mean pairwise effect size')
plt.ylabel('Factor')
plt.tight_layout()
plt.savefig('adiv_effect_size_quartiles.pdf')

Beta-div



In [13]:

    
results_w = pd.read_csv('../../data/effect-size/' +
                      'emp_90_gg_1k_weighted_unifrac.emp_qiime_mapping_qc_filtered.quartiles.tsv', 
                      sep='\t', index_col=0)



In [14]:

    
myresults_w = results_w.loc[results_w['column_name'].isin(cols)]
myresults_w = myresults_w.sort_values('effect_size', ascending=False)



In [15]:

    
sns.barplot(x='effect_size', y='column_name', data=myresults_w, palette=dict_colors)
plt.xlabel('Mean pairwise effect size')
plt.ylabel('Factor')
plt.tight_layout()
plt.savefig('bdiv_weighted_effect_size_quartiles.pdf')



In [16]:

    
results_uw = pd.read_csv('../../data/effect-size/' +
                      'emp_90_gg_1k_unweighted_unifrac.emp_qiime_mapping_qc_filtered.quartiles.tsv', 
                      sep='\t', index_col=0)



In [17]:

    
myresults_uw = results_uw.loc[results_uw['column_name'].isin(cols)]
myresults_uw = myresults_uw.sort_values('effect_size', ascending=False)



In [18]:

    
sns.barplot(x='effect_size', y='column_name', data=myresults_uw, palette=dict_colors)
plt.xlabel('Mean pairwise effect size')
plt.ylabel('Factor')
plt.tight_layout()
plt.savefig('bdiv_unweighted_effect_size_quartiles.pdf')

Plotting RDA results



In [17]:

    
sns.set_style('ticks', {'ytick.color': 'black'})



In [18]:

    
path_rda = '../../data/effect-size/rda_values.xlsx'



In [19]:

    
df_rda = pd.read_excel(path_rda)



In [20]:

    
df_rda.fillna(0, inplace=True)



In [21]:

    
df_rda









    Out[21]:






  
    
      
      study_id
      empo_3
      envo_biome_3
      longitude_deg
      latitude_deg
      response
      incl_study_id
    
  
  
    
      alpha_with_study_id
      0.6957
      0.0616
      0.0136
      0.0011
      0.0000
      alpha: observed seqs
      yes
    
    
      alpha_without_study_id
      0.0000
      0.5842
      0.0658
      0.0175
      0.0159
      alpha: observed seqs
      no
    
    
      beta_uw_with_study_id
      0.6407
      0.0920
      0.0030
      0.0001
      0.0007
      beta: unweighted unifrac
      yes
    
    
      beta_uw_without_study_id
      0.0000
      0.5072
      0.1075
      0.0185
      0.0335
      beta: unweighted unifrac
      no
    
    
      beta_w_with_study_id
      0.5412
      0.1041
      0.0017
      0.0000
      0.0024
      beta: weighted unifrac
      yes
    
    
      beta_w_without_study_id
      0.0000
      0.4712
      0.0816
      0.0201
      0.0000
      beta: weighted unifrac
      no



In [22]:

    
mycolors = ["dark grey", "windows blue", "amber", "faded green", "dusty purple"]



In [23]:

    
plt.figure(figsize=(10,10))

plt.bar(np.arange(6), df_rda.study_id, color=dict_colors['study_id'])
bottom = df_rda.study_id
plt.bar(np.arange(6), df_rda.empo_3, bottom=bottom, color=dict_colors['empo_3'])
bottom = bottom + df_rda.empo_3
plt.bar(np.arange(6), df_rda.envo_biome_3, bottom=bottom, color=dict_colors['envo_biome_3'])
bottom = bottom + df_rda.envo_biome_3
plt.bar(np.arange(6), df_rda.latitude_deg, bottom=bottom, color=dict_colors['latitude_deg'])
bottom = bottom + df_rda.latitude_deg
plt.bar(np.arange(6), df_rda.longitude_deg, bottom=bottom, color=dict_colors['longitude_deg'])

plt.xticks(np.arange(6), ['alpha w/ study_id', 'alpha w/o study_id', 'unweighted w/ study_id', 'unweighted w/o study_id', 
                          'weighted w/ study_id', 'weighted w/o study_id'], rotation=270)

plt.tight_layout()

plt.savefig('rda.pdf')



In [ ]:

	study_id	empo_3	envo_biome_3	longitude_deg	latitude_deg	response	incl_study_id
alpha_with_study_id	0.6957	0.0616	0.0136	0.0011	0.0000	alpha: observed seqs	yes
alpha_without_study_id	0.0000	0.5842	0.0658	0.0175	0.0159	alpha: observed seqs	no
beta_uw_with_study_id	0.6407	0.0920	0.0030	0.0001	0.0007	beta: unweighted unifrac	yes
beta_uw_without_study_id	0.0000	0.5072	0.1075	0.0185	0.0335	beta: unweighted unifrac	no
beta_w_with_study_id	0.5412	0.1041	0.0017	0.0000	0.0024	beta: weighted unifrac	yes
beta_w_without_study_id	0.0000	0.4712	0.0816	0.0201	0.0000	beta: weighted unifrac	no