In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
from skbio.stats.composition import ancom
from sys import argv
import matplotlib.pyplot as plt
%matplotlib inline
In [89]:
# species = argv[1]
# evalue = argv[2]
# clusters_path = argv[3]
In [74]:
# Prochlorococcus results
species = 'proch'
evalue = '1e-5'
myaxis = [0, 64, 0, 0.36]
clusters_path = '~/singlecell/clusters/orthomcl-pro4/groups.all_pro.list'
In [72]:
# Pelagibacter results
species = 'pelag'
evalue = '1e-5'
myaxis = [0, 64, 0, 0.36]
clusters_path = '~/singlecell/clusters/orthomcl-sar4/groups.all_sar.list'
In [92]:
# Tara metadata
df_tara_names = pd.read_csv('/Users/luke/singlecell/tara/Tara_Prok139_PANGAEA_Sample.csv')
df_tara_metadata = pd.read_csv('/Users/luke/singlecell/tara/Tara_Table_W8.csv')
df_tara_metadata = df_tara_names.merge(df_tara_metadata)
# SRF metadata
df_tara_metadata.index = df_tara_metadata['Sample label [TARA_station#_environmental-feature_size-fraction]']
index_SRF = [index for index in list(df_tara_metadata.index) if 'SRF' in index]
df_tara_metadata_SRF = df_tara_metadata.loc[index_SRF]
df_tara_metadata_SRF.index = df_tara_metadata_SRF.index
# Latitude column
df_tara_metadata_SRF['category_latitude'] = pd.Series(0, index=np.arange(len(df_tara_metadata_SRF.columns)), dtype='object')
for index, lat in abs(df_tara_metadata_SRF['Mean_Lat*']).iteritems():
if lat < 23.5:
df_tara_metadata_SRF.loc[index, 'category_latitude'] = 'tropical'
elif lat > 40:
df_tara_metadata_SRF.loc[index, 'category_latitude'] = 'temperate'
else:
df_tara_metadata_SRF.loc[index, 'category_latitude'] = 'subtropical'
# Temperature column
df_tara_metadata_SRF['category_temperature'] = pd.Series(0, index=np.arange(len(df_tara_metadata_SRF.columns)), dtype='object')
for index, temp in df_tara_metadata_SRF['Mean_Temperature [deg C]*'].iteritems():
if temp < 10:
df_tara_metadata_SRF.loc[index, 'category_temperature'] = 'polar'
elif temp > 20:
df_tara_metadata_SRF.loc[index, 'category_temperature'] = 'tropical'
else:
df_tara_metadata_SRF.loc[index, 'category_temperature'] = 'temperate'
# Red Sea column
df_tara_metadata_SRF['category_redsea'] = pd.Series(0, index=np.arange(len(df_tara_metadata_SRF.columns)), dtype='bool')
for index in df_tara_metadata_SRF.index:
if index in ['TARA_031_SRF_0.22-1.6', 'TARA_031_SRF_<-0.22', 'TARA_032_SRF_0.22-1.6', 'TARA_032_SRF_<-0.22', 'TARA_033_SRF_0.22-1.6', 'TARA_034_SRF_0.1-0.22', 'TARA_034_SRF_0.22-1.6', 'TARA_034_SRF_<-0.22']:
df_tara_metadata_SRF.loc[index, 'category_redsea'] = True
else:
df_tara_metadata_SRF.loc[index, 'category_redsea'] = False
In [93]:
# export mapping file
df_tara_metadata_SRF.to_csv('tara_metadata_SRF.tsv', sep='\t')
In [75]:
# Paths of input files, containing cluster counts in Tara samples
paths = pd.Series.from_csv('/Users/luke/singlecell/tara/paths_%s_%s.list' % (species, evalue), header=-1, sep='\t', index_col=None)
# Data frame of non-zero cluster counts in Tara samples (NaN if missing in sample but found in others)
pieces = []
for path in paths:
fullpath = "/Users/luke/singlecell/tara/PROK-139/%s" % path
counts = pd.DataFrame.from_csv(fullpath, header=-1, sep='\t', index_col=0)
pieces.append(counts)
df_nonzero = pd.concat(pieces, axis=1)
headings = paths.tolist()
df_nonzero.columns = headings
# SRF dataframe, transposed, zeros, plus 1, renamed indexes
col_SRF = [col for col in list(df_nonzero.columns) if 'SRF' in col]
df_nonzero_SRF = df_nonzero[col_SRF]
df_nonzero_SRF_T = df_nonzero_SRF.transpose()
df_nonzero_SRF_T.fillna(0, inplace=True)
df_nonzero_SRF_T_plusOne = df_nonzero_SRF_T + 1
df_nonzero_SRF_T_plusOne.index = [re.sub(species, 'TARA', x) for x in df_nonzero_SRF_T_plusOne.index]
df_nonzero_SRF_T_plusOne.index = [re.sub('_1e-5', '', x) for x in df_nonzero_SRF_T_plusOne.index]
# Dataframe of all clusters (includes clusters missing from Tara)
clusters = pd.Series.from_csv(clusters_path, header=-1, sep='\t', index_col=None)
df_all = df_nonzero.loc[clusters]
df_all_SRF = df_all[col_SRF]
df_all_SRF_T = df_all_SRF.transpose()
df_all_SRF_T.fillna(0, inplace=True)
# remove '1e-5' from count indexes
df_nonzero_SRF_T.index = [re.sub('_1e-5', '', x) for x in df_nonzero_SRF_T.index]
df_all_SRF_T.index = [re.sub('_1e-5', '', x) for x in df_all_SRF_T.index]
In [95]:
# export counts to file
df_nonzero_SRF_T.to_csv('tara_%s_nonzero_SRF.csv' % species)
df_all_SRF_T.to_csv('tara_%s_all_SRF.csv' % species)
In [96]:
# ANCOM with defaults alpha=0.05, tau=0.02, theta=0.1
# for grouping in ['category_latitude', 'category_temperature', 'category_redsea']:
# results = ancom(df_nonzero_SRF_T_plusOne, df_tara_metadata_SRF[grouping], multiple_comparisons_correction='holm-bonferroni')
# results.to_csv('ancom.%s_nonzero_SRF_T_plusOne.%s.csv' % (species, grouping))
In [76]:
# lookup dict for genus name
dg = {
'pelag': 'Pelagibacter',
'proch': 'Prochlorococcus'
}
In [77]:
# load OG metadata to determine RS-only OGs
df_og_metadata = pd.read_csv('/Users/luke/singlecell/notebooks/og_metadata.tsv', sep='\t', index_col=0)
In [78]:
og_rs = df_og_metadata.index[(df_og_metadata['Red_Sea_only'] == True) & (df_og_metadata['genus'] == dg[species])]
og_other = df_og_metadata.index[(df_og_metadata['Red_Sea_only'] == False) & (df_og_metadata['genus'] == dg[species])]
In [79]:
df_all_SRF_T_rs = df_all_SRF_T[og_rs]
df_all_SRF_T_other = df_all_SRF_T[og_other]
In [80]:
count = (df_all_SRF_T > 0).sum()
count_rs = (df_all_SRF_T_rs > 0).sum()
count_other = (df_all_SRF_T_other > 0).sum()
In [17]:
# save count data
count.to_csv('hist_counts_%s_ALL_og_presence_absence_in_63_tara_srf.csv' % species)
count_rs.to_csv('hist_counts_%s_RSassoc_og_presence_absence_in_63_tara_srf.csv' % species)
In [81]:
num_samples = df_all_SRF_T.shape[0]
num_ogs = max_bin = df_all_SRF_T.shape[1]
num_ogs_rsonly = count_rs.shape[0]
num_ogs_other = count_other.shape[0]
In [82]:
# all OGs AND RS-assoc OGs
plt.figure(figsize=(10,10))
sns.distplot(count_rs, bins=np.arange(num_samples+2), color=sns.xkcd_rgb['orange'], label='Red Sea-associated ortholog groups (%s)' % num_ogs_rsonly)
sns.distplot(count, bins=np.arange(num_samples+2), color=sns.xkcd_rgb['blue'], label='All %s ortholog groups (%s)' % (dg[species], num_ogs))
plt.xlabel('Number of %s Tara surface samples found in' % num_samples, fontsize=18)
plt.ylabel('Proportion of ortholog groups', fontsize=18)
plt.xticks(np.arange(0,num_samples+1,10)+0.5, ('0', '10', '20', '30', '40', '50', '60'), fontsize=14)
plt.yticks(fontsize=14)
plt.legend(fontsize=16, loc='upper left')
plt.axis(myaxis)
plt.savefig('hist_%s_paper_og_presence_absence_in_63_tara_srf.pdf' % species)
In [116]:
# all OGs
plt.figure(figsize=(8,6))
sns.distplot(count, bins=num_samples+1)
plt.axis([-0, num_samples, 0, .35])
plt.xlabel('Number of %s Tara surface samples found in' % num_samples)
plt.ylabel('Proportion of %s OGs' % num_ogs)
plt.title('Presence/absence of all %s %s OGs in %s Tara surface samples' % (num_ogs, species, num_samples))
plt.axis(myaxis)
plt.savefig('hist_%s_all_og_presence_absence_in_63_tara_srf.pdf' % species)
In [117]:
# RS-assoc OGs
plt.figure(figsize=(8,6))
sns.distplot(count_rs, bins=num_samples+1)
plt.axis([-0, num_samples, 0, .25])
plt.xlabel('Number of %s Tara surface samples found in' % num_samples)
plt.ylabel('Proportion of %s OGs' % num_ogs_rsonly)
plt.title('Presence/absence of %s RS-assoc. %s OGs in %s Tara surface samples' % (num_ogs_rsonly, species, num_samples))
plt.axis(myaxis)
plt.savefig('hist_%s_RSassoc_og_presence_absence_in_63_tara_srf.pdf' % species)
In [118]:
# other (non-RS-assoc) OGs
plt.figure(figsize=(8,6))
sns.distplot(count_other, bins=num_samples+1)
plt.axis([0, num_samples, 0, .4])
plt.xlabel('Number of %s Tara surface samples found in' % num_samples)
plt.ylabel('Proportion of %s OGs' % num_ogs_other)
plt.title('Presence/absence of %s non-RS-assoc. %s OGs in %s Tara surface samples' % (num_ogs_other, species, num_samples))
plt.axis(myaxis)
plt.savefig('hist_%s_nonRSassoc_og_presence_absence_in_63_tara_srf.pdf' % species)
In [ ]: