In [1]:
import numpy as np
import math
import csv
import glob
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from itertools import cycle
from skbio.stats.composition import clr
from scipy.spatial.distance import dice
from sklearn import svm
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
# np.interp is used below in place of scipy.interp, which newer SciPy releases removed
sns.set_style("ticks")
sns.palplot(sns.color_palette("colorblind", 10))
%matplotlib inline
In [2]:
COLORS = sns.color_palette("colorblind", 10)
TAXONOMIC_PROFILER = 'burst'
CLR = True
# CLASSIFIER = lambda: OneVsRestClassifier(RandomForestClassifier(n_estimators=12))
CLASSIFIER = lambda: OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=1))  # probability=True enables predict_proba for ROC; seed made explicit (True coerces to 1)
CLASSES = ["tongue_dorsum", "stool", "supragingival_plaque", "right_retroauricular_crease",
           "left_retroauricular_crease", "subgingival_plaque"]
CLASSES_MAP = dict(zip(CLASSES, ("oral", "stool", "oral", "skin", "skin", "oral")))
In [3]:
def save_plot(fig, pltname, artists=()):
    fig.savefig(os.path.join("..", "figures", "hmp_" + pltname + ".png"),
                dpi=300, bbox_extra_artists=artists, bbox_inches='tight')
In [4]:
# Load up the mapping file
mapping_df = pd.read_csv('../data/hmp/HMASM-690.csv', sep=",")
mapping_df.dtypes
Out[4]:
In [5]:
# What are these classifications?
set(mapping_df['Body Site'])
Out[5]:
In [6]:
# Tally the total read count at each subsampling depth
from collections import defaultdict
depths_dict = defaultdict(int)
files = glob.glob("../results/HMP/**/count.seqs.txt")
for file in files:
    sequence_depth = !cat {file}  # IPython shell capture; the first line is the read count
    depth = file.split('/')[-2].split('_')[1]  # depth label encoded in the directory name
    depths_dict[depth] += int(sequence_depth[0])
In [7]:
depths_dict
Out[7]:
In [8]:
# Load up the list of taxonomic profilers
taxonomic_profilers = ['utree', 'burst', 'burst.taxonomy', 'bowtie2']
# BURST (capitalist mode) is the profiler selected via TAXONOMIC_PROFILER above
In [9]:
# Merge taxatables
def fetch_merged_taxatable(depth):
    files = glob.glob("../results/HMP/*_{}_*/taxatable.{}.species.txt".format(depth, TAXONOMIC_PROFILER))
    df = pd.read_csv(files[0], sep="\t")
    for file in files[1:]:
        df_tmp = pd.read_csv(file, sep="\t")
        df = pd.concat([df, df_tmp])
    df = df.groupby("#OTU ID").sum()
    return df.fillna(0)
df = fetch_merged_taxatable('01')
df.head()
Out[9]:
In [10]:
hit_rate = {}
# Compare samples to mapping file
fulldepth_df = fetch_merged_taxatable('fulldepth')
sample_ids = [_.split('.')[0] for _ in fulldepth_df.columns]
temp_fulldepth_df = fulldepth_df.T
hit_rate['fulldepth'] = fulldepth_df.sum().sum()/depths_dict['fulldepth']
temp_fulldepth_df['SRS ID'] = sample_ids
join_df = temp_fulldepth_df.merge(mapping_df, on="SRS ID", how="inner")
join_df["Body Site"].describe()
Out[10]:
In [11]:
join_df["Summary Site"] = [CLASSES_MAP[_] for _ in join_df["Body Site"]]
join_df["Summary Site"].describe()
Out[11]:
In [12]:
pd.value_counts(join_df["Summary Site"])
Out[12]:
In [13]:
mean_df = join_df.groupby("Summary Site").mean(numeric_only=True)  # mean taxon profile per summary site
In [14]:
# Top 100 oral taxa, rescaled to 10,000,000 total counts
oral_taxa = mean_df.T["oral"].sort_values(ascending=False).head(100)
oral_taxa = np.round(oral_taxa*(10000000/oral_taxa.sum())).astype(int)
In [15]:
skin_taxa = mean_df.T["skin"].sort_values(ascending=False).head(100)
skin_taxa = np.round(skin_taxa*(10000000/skin_taxa.sum())).astype(int)
In [16]:
stool_taxa = mean_df.T["stool"].sort_values(ascending=False).head(100)
stool_taxa = np.round(stool_taxa*(10000000/stool_taxa.sum())).astype(int)
In [17]:
df_sims = pd.DataFrame([stool_taxa, skin_taxa, oral_taxa])
df_sims = df_sims.fillna(0)
df_sims.T.to_csv("../results/HMP/taxatable.meta.10m.txt", sep="\t", index_label="#OTU ID")
In [18]:
pd.value_counts(join_df["Body Site"])
Out[18]:
In [19]:
join_df[join_df["Body Site"] == "left_retroauricular_crease"]["SRS ID"]
Out[19]:
In [20]:
join_df = join_df.sort_index()
In [21]:
def normalize_taxatable(df, prev=lambda df: int(df.shape[1]/30), percent=0, map_sample_ids=True, normalize_relative_abundance=True):
    # Input orientation: Features x Samples
    # Redistribute each sample to the median sequencing depth
    df = df.div(df.sum(axis=0).div(df.sum(axis=0).median()), axis=1).round().astype(int)
    if map_sample_ids:
        sample_ids = [_.split('.')[0] for _ in df.columns]
        df.columns = sample_ids
    # Abundance filter: drop features whose mean relative abundance is <= percent
    if normalize_relative_abundance:
        df = df.apply(lambda x: x/x.sum(), axis=0)
        df = df[df.apply(lambda x: x.mean(), axis=1) > percent]
    else:
        df_temp = df.apply(lambda x: x/x.sum(), axis=0)
        df = df[df_temp.apply(lambda x: x.mean(), axis=1) > percent]
    # Prevalence filter: keep features observed in more than prev(df) samples
    # ((x > 0) rather than (x >= 1) so the filter also works after relative-abundance scaling)
    df = df[df.apply(lambda x: (x > 0).sum(), axis=1) > prev(df)]
    if normalize_relative_abundance:
        df = df.apply(lambda x: x/x.sum(), axis=0)
    return df.sort_index(axis=0).sort_index(axis=1)
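To make the filters concrete, here is a toy run on hypothetical counts (not HMP data): features whose mean relative abundance falls at or below percent, or that are present in no more than prev(df) samples, are dropped before renormalization.
# Toy demonstration (made-up counts) of the abundance/prevalence filters
toy = pd.DataFrame({'s1.fa': [100, 1, 0], 's2.fa': [80, 0, 0], 's3.fa': [120, 2, 1]},
                   index=['taxonA', 'taxonB', 'taxonC'])
normalize_taxatable(toy, prev=lambda df: 1, percent=0.001)
# taxonA and taxonB survive; taxonC is present in only one sample, so the prevalence filter drops it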
In [22]:
# fulldepth_df.sum()
In [23]:
# Normalize the full depth taxatable
fulldepth_df = normalize_taxatable(fulldepth_df, percent=0.002, normalize_relative_abundance=False)
fulldepth_df[fulldepth_df == 0] = .65
fulldepth_df = fulldepth_df.apply(lambda x: x/x.sum(), axis=0)
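As an aside, scikit-bio ships a principled zero-replacement routine that could stand in for the fixed 0.65 pseudocount above; a minimal sketch (its default delta differs, so results would not be identical):
from skbio.stats.composition import multiplicative_replacement
# Hypothetical alternative: expects a zeros-containing table with compositions as rows
# (samples x features), hence the transpose
no_zeros = multiplicative_replacement(fulldepth_df.T.values)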
In [24]:
fulldepth_df.head()
Out[24]:
In [25]:
if CLR:
    # CLR expects samples as rows; transpose, transform, and rebuild the frame
    data = fulldepth_df.T.values
    data = clr(data)
    fulldepth_df = pd.DataFrame(data, index=fulldepth_df.columns, columns=fulldepth_df.index)
else:
    fulldepth_df = fulldepth_df.T
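The CLR (centered log-ratio) transform is log(x) centered by the mean log, i.e. the log of each part over the composition's geometric mean; a quick toy check (made-up composition):
x = np.array([.2, .2, .6])
np.allclose(clr(x), np.log(x) - np.log(x).mean())  # True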
In [26]:
fulldepth_df.shape
Out[26]:
In [27]:
def inverse_simpson(row):
    # log2 of the inverse Simpson index, 1 / sum(p_i^2), on relative abundances
    row = row/np.sum(row)
    return np.log2(1/np.sum(np.power(row, 2)))
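Quick sanity check on made-up counts: a community of n equally abundant taxa has inverse Simpson index n, so the returned value is log2(n).
inverse_simpson(np.array([5, 5, 5, 5]))  # 1 / (4 * 0.25**2) = 4, and log2(4) = 2.0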
In [28]:
depths = ["1", "01", "001", "0001"]  # subsample labels: "01" means a .01 fraction of reads
dice_coefficients = []
alpha_diversity = []
df_depths = {'fulldepth': fulldepth_df}
for depth in depths:
    depth_df = fetch_merged_taxatable(depth)
    sample_ids = [_.split('.')[0] for _ in depth_df.columns]
    hit_rate[depth] = depth_df.sum().sum()/depths_dict[depth]
    depth_df.columns = sample_ids
    # Align the subsampled table onto the full-depth sample x feature grid
    temp = pd.DataFrame(np.zeros(fulldepth_df.shape), index=fulldepth_df.index, columns=fulldepth_df.columns)
    for index in depth_df.index:
        if index in temp.columns:
            temp[str(index)] = depth_df.loc[str(index), :]
    depth_df = temp
    depth_df = normalize_taxatable(depth_df.T, prev=lambda df: 0, map_sample_ids=False, normalize_relative_abundance=False)
    for _, row in depth_df.T.iterrows():
        # float("." + "01") -> 0.01, so np.log10 puts the depth on a log10 scale;
        # binarize to presence/absence, since scipy's dice expects boolean vectors
        dice_coefficients.append([np.log10(float("." + str(depth))), dice(row > 0, np.ones(len(row), dtype=bool)), TAXONOMIC_PROFILER])
        alpha_diversity.append([_, np.log10(float("." + str(depth))), inverse_simpson(row), TAXONOMIC_PROFILER])
    # Same zero replacement and CLR treatment as the full-depth table
    depth_df[depth_df == 0] = .65
    depth_df = depth_df.apply(lambda x: x/x.sum(), axis=0)
    depth_df = depth_df.T
    if CLR:
        data = depth_df.values
        data = clr(data)
        depth_df = pd.DataFrame(data, index=depth_df.index, columns=depth_df.columns)
    df_depths[depth] = depth_df
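The Dice call compares each sample's presence/absence vector against all-ones, so it measures the share of full-depth features that drop out at a given subsampling level; a toy run (made-up vector):
dice(np.array([1, 1, 0, 0]) > 0, np.ones(4, dtype=bool))
# = (0 + 2) / (2*2 + 2 + 0) = 1/3: two of the four features are absent from the first vector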
In [29]:
# Note: the 'correlation' column here holds Dice dissimilarities; the name matches the Pearson/Spearman frames below
df_dice = pd.DataFrame(dice_coefficients, columns=['depth', 'correlation', 'profiler'])
In [30]:
df_alpha = pd.DataFrame(alpha_diversity, columns=['SRS ID', 'depth', 'alpha', 'profiler'])
In [31]:
df_alpha = df_alpha.merge(mapping_df, on=["SRS ID"], how='inner')
df_alpha.tail()
Out[31]:
In [32]:
df_alpha.to_csv('../figures/hmp_alpha_table.txt')
fig = sns.factorplot(x="Body Site", y="alpha", hue="depth", data=df_alpha, palette=COLORS, kind="box", size=5, aspect=3)
fig.set_ylabels("log2(inverse Simpson alpha diversity)").set_xlabels("Body Site")
pltname = "alpha_diversity"
save_plot(fig, pltname)
In [33]:
mapping_df['SRS ID']
Out[33]:
In [34]:
# Box plot of per-depth hit rates (fraction of reads aligned to the database)
fig = plt.figure()
plt.boxplot(list(hit_rate.values()), labels=[""])
plt.ylabel("Fraction of Reads that Hit the Database")
plt.title("Hit Rate")
save_plot(fig, "hit_rate")
In [35]:
def multiclass_roc(X, y, classifier=CLASSIFIER(), classes=['IGT', 'NGT', 'T2D']):
    # y arrives already binarized (one column per class)
    n_classes = len(classes)
    # Shuffle and split into stratified training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, stratify=y,
                                                        random_state=0)
    # Learn to predict each class against the rest
    y_score = classifier.fit(X_train, y_train).predict_proba(X_test)
    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    return fpr, tpr, roc_auc
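For intuition, roc_curve and auc on a four-sample toy (the classic scikit-learn docs example):
fpr_toy, tpr_toy, _ = roc_curve([0, 0, 1, 1], [0.1, 0.4, 0.35, 0.8])
auc(fpr_toy, tpr_toy)  # 0.75: one negative (score 0.4) is ranked above one positive (score 0.35)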
In [36]:
def plot_roc_auc_classifier(depth, fpr, tpr, roc_auc, ix=2, classes=['IGT', 'NGT', 'T2D']):
    fig = plt.figure()
    lw = 2
    plt.plot(fpr[ix], tpr[ix], color=COLORS[0],
             lw=lw, label='%s ROC curve (area = %0.2f)' % (classes[ix], roc_auc[ix]), alpha=.8)
    plt.plot([0, 1], [0, 1], color=COLORS[1], lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    if depth != 'fulldepth':
        plt.title('Receiver Operating Characteristic %s depth=.%s' % (classes[ix], str(depth)))
    else:
        plt.title('Receiver Operating Characteristic %s' % (classes[ix]))
    plt.legend(loc="lower right")
    save_plot(fig, 'roc_{}_{}'.format(depth, classes[ix]))
# Compute macro-average ROC curve and ROC area
def plot_multiroc_auc_classifier(depth, fpr, tpr, roc_auc, classes=['IGT', 'NGT', 'T2D']):
    lw = 2
    n_classes = len(classes)
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    # Finally average it and compute AUC
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    # Plot all ROC curves
    fig = plt.figure()
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
             color='deeppink', linestyle=':', linewidth=4)
    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4, alpha=.8)
    for i, color, target_name in zip(range(n_classes), COLORS, classes):
        plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                 label='ROC curve of class {0} (area = {1:0.2f})'.format(target_name, roc_auc[i]), alpha=.8)
    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    if depth != 'fulldepth':
        plt.title('Multiclass Receiver Operating Characteristic depth=.%s' % str(depth))
    else:
        plt.title('Multiclass Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    save_plot(fig, 'multiroc_{}'.format(depth))
In [37]:
for depth, df in df_depths.items():
    temp_y = join_df["Body Site"].values
    temp_y = label_binarize(temp_y, classes=CLASSES)
    # Fit a one-vs-rest classifier and plot the multiclass ROC at this depth
    fpr, tpr, roc_auc = multiclass_roc(df, temp_y, classes=CLASSES)
    plot_multiroc_auc_classifier(depth, fpr, tpr, roc_auc, classes=CLASSES)
    # for i in range(len(CLASSES)):
    #     plot_roc_auc_classifier(depth, fpr, tpr, roc_auc, ix=i, classes=CLASSES)
In [38]:
result_cv_scores = []
for depth, df in df_depths.items():
    temp_y = join_df["Body Site"].values
    temp_y = label_binarize(temp_y, classes=CLASSES)
    # Drop tongue_dorsum samples (the first binarized column), then drop that column
    mask = temp_y[:, 0] == 0
    df = df[mask]
    temp_y = temp_y[mask]
    temp_y = temp_y[:, 1:]
    # Note: glitches at cv past 5
    scores = cross_val_score(CLASSIFIER(), df, temp_y, cv=10, scoring="f1_micro")
    if depth == 'fulldepth':
        depth_name = 0
    else:
        depth_name = np.log10(float("." + str(depth)))
    for score in scores:
        result_cv_scores.append([depth_name, score, TAXONOMIC_PROFILER])
df_result_cv_scores = pd.DataFrame(result_cv_scores, columns=['depth', 'score', 'profiler'])
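What the binarize-and-mask step above does, on a toy label vector (hypothetical sites):
toy_y = label_binarize(['stool', 'tongue_dorsum', 'stool'], classes=CLASSES)
mask_toy = toy_y[:, 0] == 0   # array([ True, False,  True]): keep the non-tongue_dorsum samples
toy_y[mask_toy][:, 1:]        # remaining rows, with the tongue_dorsum column removed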
In [39]:
# df_result_cv_scores['mean'] = df_result_cv_scores.values[:,1:].mean(axis=1)
# df_result_cv_scores = df_result_cv_scores.sort_values('mean')
df_result_cv_scores.to_csv('../figures/hmp_f1_cv_table.txt')
g = sns.factorplot(x="depth", y="score", kind="box", data=df_result_cv_scores, palette=COLORS, legend=False)
g.set_ylabels("F1 Micro").set_xlabels("Depth (log10)")
plt.title("Classification Accuracy by Depth")
pltname = "f1"
save_plot(g, pltname)
In [40]:
def plot_pca(df_depths, y, colors=COLORS, classes=['NGT', 'T2D']):
    n_classes = len(classes)
    fulldepth_df = df_depths['fulldepth']
    pca = PCA(n_components=2)
    pca.fit(fulldepth_df)
    # Percentage of variance explained by each of the first two components
    print('explained variance ratio (first two components): %s'
          % str(pca.explained_variance_ratio_))
    # Project every depth onto the axes fit at full depth
    for depth, df in df_depths.items():
        X_r = pca.transform(df)
        fig = plt.figure()
        lw = 2
        for color, target_name in zip(colors, classes):
            plt.scatter(X_r[y == target_name, 0], X_r[y == target_name, 1], color=color, alpha=.8, lw=lw,
                        label=target_name)
        lgd = plt.legend(shadow=False, scatterpoints=1, loc='center left', bbox_to_anchor=(1, 0.5))
        if depth != 'fulldepth':
            plt.title('PCA of HMP dataset depth=.%s' % str(depth))
        else:
            plt.title('PCA of HMP dataset')
        save_plot(fig, 'pca_{}_nclasses{}'.format(depth, str(n_classes)), artists=(lgd,))
In [41]:
plot_pca(df_depths, join_df["Body Site"].values, classes=CLASSES)
In [42]:
def plot_lda(df_depths, y, colors=COLORS, classes=['NGT', 'T2D']):
    fulldepth_df = df_depths['fulldepth']
    lda = LinearDiscriminantAnalysis(n_components=2)
    # Fit at full depth, then project every depth onto the same discriminant axes
    lda.fit(fulldepth_df, y)
    lw = 2
    for depth, df in df_depths.items():
        X_r2 = lda.transform(df)
        fig = plt.figure()
        for color, target_name in zip(colors, classes):
            plt.scatter(X_r2[y == target_name, 0], X_r2[y == target_name, 1], alpha=.8, color=color, lw=lw, label=target_name)
        lgd = plt.legend(shadow=False, scatterpoints=1, loc='center left', bbox_to_anchor=(1, 0.5))
        if depth != 'fulldepth':
            plt.title('LDA of HMP dataset depth=.%s' % str(depth))
        else:
            plt.title('LDA of HMP dataset')
        save_plot(fig, 'lda_{}'.format(depth), artists=(lgd,))
In [43]:
plot_lda(df_depths, join_df["Body Site"].values, classes=CLASSES)
In [44]:
# Pearson correlation of each sample's full-depth profile with its subsampled profile
from scipy.stats import pearsonr
def pearson_correlations(df_depths):
    pearson_correlations = []
    fulldepth_df = df_depths['fulldepth']
    for depth, df in df_depths.items():
        if depth != 'fulldepth':
            for i, row in fulldepth_df.iterrows():
                pearson_correlations.append([np.log10(float("." + str(depth))), pearsonr(row, df.loc[i])[0], TAXONOMIC_PROFILER])
    # Full depth correlates perfectly with itself
    pearson_correlations.append([0, 1.0, TAXONOMIC_PROFILER])
    return pd.DataFrame(pearson_correlations, columns=['depth', 'correlation', 'profiler'])
In [45]:
pearson_df = pearson_correlations(df_depths)
In [46]:
g = sns.factorplot(x="depth", y="correlation", hue="profiler", data=pearson_df, palette=COLORS, capsize=.2, size=6, aspect=.75, alpha=.1, legend=False)
g.set_ylabels("Pearson Correlation").set_xlabels("Depth (log10)")
plt.title("Pearson Correlation with Full Depth")
pltname = "pearson_correlation"
save_plot(g, pltname)
In [47]:
# Spearman Correlation
from scipy.stats import spearmanr
def spearman_correlations(df_depths):
    spearman_correlations = []
    fulldepth_df = df_depths['fulldepth']
    for depth, df in df_depths.items():
        if depth != 'fulldepth':
            for i, row in fulldepth_df.iterrows():
                spearman_correlations.append([np.log10(float("." + str(depth))), spearmanr(row, df.loc[i])[0], TAXONOMIC_PROFILER])
    # Full depth correlates perfectly with itself
    spearman_correlations.append([0, 1.0, TAXONOMIC_PROFILER])
    return pd.DataFrame(spearman_correlations, columns=['depth', 'correlation', 'profiler'])
In [48]:
spearman_df = spearman_correlations(df_depths)
g = sns.factorplot(x="depth", y="correlation", hue="profiler", data=spearman_df, palette=COLORS, capsize=.2, size=6, aspect=.75, alpha=.1, legend=False)
g.set_ylabels("Spearman Correlation").set_xlabels("Depth (log10)")
plt.title("Spearman Correlation with Full Depth")
pltname = "spearman_correlation"
save_plot(g, pltname)
In [49]:
# from skbio.stats.composition import ancom, clr_inv
# results = []
# for depth, df in df_depths.items():
#     if CLR:
#         data = clr_inv(df)
#         df = pd.DataFrame(data, index=df.index, columns=df.columns)
#     results.append([depth, ancom(df, join_df["Body Site"])])
In [50]:
g = sns.factorplot(x="depth", y="correlation", hue="profiler", data=df_dice, palette=COLORS, capsize=.2, size=6, aspect=.75, alpha=.1, legend=False)
g.set_ylabels("Dice Dissimilarity").set_xlabels("Depth (log10)")
plt.title("Dice Dissimilarity to Full Depth")
pltname = "dice_coefficients"
save_plot(g, pltname)
In [51]:
print(CLASSES)
In [55]:
from skbio.stats.composition import ancom, clr_inv
results = []
for depth, df in df_depths.items():
    # Invert the CLR transform (CLR is True above) back to compositions before export
    data = clr_inv(df)
    df = pd.DataFrame(data, index=df.index, columns=df.columns)
    df.to_csv("../results/hmp_{}_otu_table.txt".format(depth))
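clr_inv exponentiates the coordinates and re-closes them to unit sum, so it inverts clr up to that constraint; a quick toy check (made-up composition):
x = np.array([.2, .3, .5])
np.allclose(clr_inv(clr(x)), x)  # True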