Statistics on counts of OGs (columns) in Tara surface samples (rows)

  • Finding significant OGs across groups of Tara samples using ANCOM
  • Determining whether distribution of percent Tara samples found in differs for subgroups (z-test)

Does everything separately for pelag and proch data.

Import libraries


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
from skbio.stats.composition import ancom
from sys import argv
import matplotlib.pyplot as plt
%matplotlib inline


/Users/luke/anaconda/envs/qiime190/lib/python2.7/site-packages/pandas/computation/__init__.py:19: UserWarning: The installed version of numexpr 2.4.4 is not supported in pandas and will be not be used

  UserWarning)
/Users/luke/anaconda/envs/qiime190/lib/python2.7/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

Assign variables


In [89]:
# species = argv[1]
# evalue = argv[2]
# clusters_path = argv[3]

In [74]:
# Prochlorococcus results
species = 'proch'
evalue = '1e-5'
myaxis = [0, 64, 0, 0.36]
clusters_path = '~/singlecell/clusters/orthomcl-pro4/groups.all_pro.list'

In [72]:
# Pelagibacter results
species = 'pelag'
evalue = '1e-5'
myaxis = [0, 64, 0, 0.36]
clusters_path = '~/singlecell/clusters/orthomcl-sar4/groups.all_sar.list'

Format and save Tara metadata


In [92]:
# Tara metadata
df_tara_names = pd.read_csv('/Users/luke/singlecell/tara/Tara_Prok139_PANGAEA_Sample.csv')
df_tara_metadata =  pd.read_csv('/Users/luke/singlecell/tara/Tara_Table_W8.csv')
df_tara_metadata = df_tara_names.merge(df_tara_metadata)

# SRF metadata
df_tara_metadata.index = df_tara_metadata['Sample label [TARA_station#_environmental-feature_size-fraction]']
index_SRF = [index for index in list(df_tara_metadata.index) if 'SRF' in index]
df_tara_metadata_SRF = df_tara_metadata.loc[index_SRF]
df_tara_metadata_SRF.index = df_tara_metadata_SRF.index

# Latitude column
df_tara_metadata_SRF['category_latitude'] = pd.Series(0, index=np.arange(len(df_tara_metadata_SRF.columns)), dtype='object')
for index, lat in abs(df_tara_metadata_SRF['Mean_Lat*']).iteritems():
    if lat < 23.5:
        df_tara_metadata_SRF.loc[index, 'category_latitude'] = 'tropical'
    elif lat > 40:
        df_tara_metadata_SRF.loc[index, 'category_latitude'] = 'temperate'
    else:
        df_tara_metadata_SRF.loc[index, 'category_latitude'] = 'subtropical'

# Temperature column
df_tara_metadata_SRF['category_temperature'] = pd.Series(0, index=np.arange(len(df_tara_metadata_SRF.columns)), dtype='object')
for index, temp in df_tara_metadata_SRF['Mean_Temperature [deg C]*'].iteritems():
    if temp < 10:
        df_tara_metadata_SRF.loc[index, 'category_temperature'] = 'polar'
    elif temp > 20:
        df_tara_metadata_SRF.loc[index, 'category_temperature'] = 'tropical'
    else:
        df_tara_metadata_SRF.loc[index, 'category_temperature'] = 'temperate'

# Red Sea column
df_tara_metadata_SRF['category_redsea'] = pd.Series(0, index=np.arange(len(df_tara_metadata_SRF.columns)), dtype='bool')
for index in df_tara_metadata_SRF.index:
    if index in ['TARA_031_SRF_0.22-1.6', 'TARA_031_SRF_<-0.22', 'TARA_032_SRF_0.22-1.6', 'TARA_032_SRF_<-0.22', 'TARA_033_SRF_0.22-1.6', 'TARA_034_SRF_0.1-0.22', 'TARA_034_SRF_0.22-1.6', 'TARA_034_SRF_<-0.22']:
        df_tara_metadata_SRF.loc[index, 'category_redsea'] = True
    else:
        df_tara_metadata_SRF.loc[index, 'category_redsea'] = False

In [93]:
# export mapping file
df_tara_metadata_SRF.to_csv('tara_metadata_SRF.tsv', sep='\t')

Format and save count data


In [75]:
# Paths of input files, containing cluster counts in Tara samples
paths = pd.Series.from_csv('/Users/luke/singlecell/tara/paths_%s_%s.list' % (species, evalue), header=-1, sep='\t', index_col=None)

# Data frame of non-zero cluster counts in Tara samples (NaN if missing in sample but found in others)
pieces = []
for path in paths:
    fullpath = "/Users/luke/singlecell/tara/PROK-139/%s" % path
    counts = pd.DataFrame.from_csv(fullpath, header=-1, sep='\t', index_col=0)
    pieces.append(counts)
df_nonzero = pd.concat(pieces, axis=1)
headings = paths.tolist()
df_nonzero.columns = headings

# SRF dataframe, transposed, zeros, plus 1, renamed indexes
col_SRF = [col for col in list(df_nonzero.columns) if 'SRF' in col]
df_nonzero_SRF = df_nonzero[col_SRF]
df_nonzero_SRF_T = df_nonzero_SRF.transpose()
df_nonzero_SRF_T.fillna(0, inplace=True)
df_nonzero_SRF_T_plusOne = df_nonzero_SRF_T + 1
df_nonzero_SRF_T_plusOne.index = [re.sub(species, 'TARA', x) for x in df_nonzero_SRF_T_plusOne.index]
df_nonzero_SRF_T_plusOne.index = [re.sub('_1e-5', '', x) for x in df_nonzero_SRF_T_plusOne.index]

# Dataframe of all clusters (includes clusters missing from Tara)
clusters = pd.Series.from_csv(clusters_path, header=-1, sep='\t', index_col=None)
df_all = df_nonzero.loc[clusters]
df_all_SRF = df_all[col_SRF]
df_all_SRF_T = df_all_SRF.transpose()
df_all_SRF_T.fillna(0, inplace=True)

# remove '1e-5' from count indexes
df_nonzero_SRF_T.index = [re.sub('_1e-5', '', x) for x in df_nonzero_SRF_T.index]
df_all_SRF_T.index = [re.sub('_1e-5', '', x) for x in df_all_SRF_T.index]

In [95]:
# export counts to file
df_nonzero_SRF_T.to_csv('tara_%s_nonzero_SRF.csv' % species)
df_all_SRF_T.to_csv('tara_%s_all_SRF.csv' % species)

ANCOM


In [96]:
# ANCOM with defaults alpha=0.05, tau=0.02, theta=0.1
# for grouping in ['category_latitude', 'category_temperature', 'category_redsea']:
#     results = ancom(df_nonzero_SRF_T_plusOne, df_tara_metadata_SRF[grouping], multiple_comparisons_correction='holm-bonferroni')
#     results.to_csv('ancom.%s_nonzero_SRF_T_plusOne.%s.csv' % (species, grouping))

Z-test


In [76]:
# lookup dict for genus name
dg = {
    'pelag': 'Pelagibacter',
    'proch': 'Prochlorococcus'
}

In [77]:
# load OG metadata to determine RS-only OGs
df_og_metadata = pd.read_csv('/Users/luke/singlecell/notebooks/og_metadata.tsv', sep='\t', index_col=0)

In [78]:
og_rs = df_og_metadata.index[(df_og_metadata['Red_Sea_only'] == True) & (df_og_metadata['genus'] == dg[species])]
og_other = df_og_metadata.index[(df_og_metadata['Red_Sea_only'] == False) & (df_og_metadata['genus'] == dg[species])]

In [79]:
df_all_SRF_T_rs = df_all_SRF_T[og_rs]
df_all_SRF_T_other = df_all_SRF_T[og_other]

In [80]:
count = (df_all_SRF_T > 0).sum()
count_rs = (df_all_SRF_T_rs > 0).sum()
count_other = (df_all_SRF_T_other > 0).sum()

In [17]:
# save count data
count.to_csv('hist_counts_%s_ALL_og_presence_absence_in_63_tara_srf.csv' % species)
count_rs.to_csv('hist_counts_%s_RSassoc_og_presence_absence_in_63_tara_srf.csv' % species)

In [81]:
num_samples = df_all_SRF_T.shape[0]
num_ogs = max_bin = df_all_SRF_T.shape[1]
num_ogs_rsonly = count_rs.shape[0]
num_ogs_other = count_other.shape[0]

In [82]:
# all OGs AND RS-assoc OGs
plt.figure(figsize=(10,10))
sns.distplot(count_rs, bins=np.arange(num_samples+2), color=sns.xkcd_rgb['orange'], label='Red Sea-associated ortholog groups (%s)' % num_ogs_rsonly)
sns.distplot(count, bins=np.arange(num_samples+2), color=sns.xkcd_rgb['blue'], label='All %s ortholog groups (%s)' % (dg[species], num_ogs))
plt.xlabel('Number of %s Tara surface samples found in' % num_samples, fontsize=18)
plt.ylabel('Proportion of ortholog groups', fontsize=18)
plt.xticks(np.arange(0,num_samples+1,10)+0.5, ('0', '10', '20', '30', '40', '50', '60'), fontsize=14)
plt.yticks(fontsize=14)
plt.legend(fontsize=16, loc='upper left')
plt.axis(myaxis)
plt.savefig('hist_%s_paper_og_presence_absence_in_63_tara_srf.pdf' % species)



In [116]:
# all OGs
plt.figure(figsize=(8,6))
sns.distplot(count, bins=num_samples+1)
plt.axis([-0, num_samples, 0, .35])
plt.xlabel('Number of %s Tara surface samples found in' % num_samples)
plt.ylabel('Proportion of %s OGs' % num_ogs)
plt.title('Presence/absence of all %s %s OGs in %s Tara surface samples' % (num_ogs, species, num_samples))
plt.axis(myaxis)
plt.savefig('hist_%s_all_og_presence_absence_in_63_tara_srf.pdf' % species)



In [117]:
# RS-assoc OGs
plt.figure(figsize=(8,6))
sns.distplot(count_rs, bins=num_samples+1)
plt.axis([-0, num_samples, 0, .25])
plt.xlabel('Number of %s Tara surface samples found in' % num_samples)
plt.ylabel('Proportion of %s OGs' % num_ogs_rsonly)
plt.title('Presence/absence of %s RS-assoc. %s OGs in %s Tara surface samples' % (num_ogs_rsonly, species, num_samples))
plt.axis(myaxis)
plt.savefig('hist_%s_RSassoc_og_presence_absence_in_63_tara_srf.pdf' % species)



In [118]:
# other (non-RS-assoc) OGs
plt.figure(figsize=(8,6))
sns.distplot(count_other, bins=num_samples+1)
plt.axis([0, num_samples, 0, .4])
plt.xlabel('Number of %s Tara surface samples found in' % num_samples)
plt.ylabel('Proportion of %s OGs' % num_ogs_other)
plt.title('Presence/absence of %s non-RS-assoc. %s OGs in %s Tara surface samples' % (num_ogs_other, species, num_samples))
plt.axis(myaxis)
plt.savefig('hist_%s_nonRSassoc_og_presence_absence_in_63_tara_srf.pdf' % species)



In [ ]: