In [1]:
import glob
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import dice
from skbio.stats.composition import clr
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize
# Note: scipy.interp was removed from recent SciPy; np.interp is used below instead
sns.set_style("ticks")
sns.palplot(sns.color_palette("colorblind", 10))
%matplotlib inline
In [2]:
COLORS = sns.color_palette("colorblind", 5)
TAXONOMIC_PROFILER = 'burst'
CLR = True
# CLASSIFIER = lambda: OneVsRestClassifier(RandomForestClassifier(n_estimators=12))
CLASSIFIER = lambda: OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=0))  # random_state takes an int seed, not a bool
CLASSES = ['NGT', 'T2D']
RUN_TWO_CLASS_ROC_AUC = False
In [3]:
def save_plot(fig, pltname, artists=()):
    fig.savefig(os.path.join("..", "figures", "karlsson2013_" + pltname + ".png"),
                dpi=300, bbox_extra_artists=artists, bbox_inches='tight')
In [4]:
# Load up the mapping file
mapping_df = pd.read_csv('../data/karlsson2013/map.txt', sep="\t")
mapping_df.dtypes
Out[4]:
In [5]:
# What are these classifications?
set(mapping_df['Classification '])
# Disease classification: NGT = normal glucose tolerance; IGT = impaired glucose tolerance (i.e. pre-diabetic); T2D = type 2 diabetes
Out[5]:
In [6]:
# Get the sequence depth of each subsampling level
depths_dict = {}
files = glob.glob("../results/karlsson2013/**/count.seqs.txt")
for file in files:
    # !cat returns an IPython SList; the first line holds the read count
    sequence_depth = !cat {file}
    depth = file.split('/')[-2].split('_')[0]
    depths_dict[depth] = int(sequence_depth[0])
In [38]:
depths_dict
Out[38]:
In [7]:
# Load up the list of taxonomic profilers
taxonomic_profilers = ['utree', 'burst', 'burst.taxonomy', 'bowtie2']
# BURST (capitalist mode) is selected via TAXONOMIC_PROFILER above
In [8]:
hit_rate = {}
# Compare samples to mapping file
fulldepth_df = pd.read_csv("../results/karlsson2013/fulldepth_1/taxatable.{}.species.txt".format(TAXONOMIC_PROFILER), sep="\t", index_col=0)
sample_ids = [int(_.split('.')[1]) for _ in fulldepth_df.columns]
temp_fulldepth_df = fulldepth_df.T
# Hit rate = reads assigned by the profiler / total reads sequenced
hit_rate['fulldepth'] = fulldepth_df.sum().sum()/depths_dict['fulldepth']
temp_fulldepth_df['#Sample ID'] = sample_ids
join_df = temp_fulldepth_df.merge(mapping_df, on="#Sample ID", how="inner")
join_df["Classification "].describe()
Out[8]:
In [9]:
join_df["Classification "].value_counts()
Out[9]:
In [10]:
classes = {'IGT', 'NGT', 'T2D'}
join_df = join_df[join_df['Classification '].isin(classes)]
join_df.index = join_df["#Sample ID"]
join_df = join_df.sort_index()
In [11]:
def normalize_taxatable(df, prev=lambda df: int(df.shape[1]/2), percent=0, map_sample_ids=True, normalize_relative_abundance=True):
    # df is Features x Samples
    # Rescale every sample to the median sequencing depth
    df = df.div(df.sum(axis=0).div(df.sum(axis=0).median()), axis=1).round().astype(int)
    if map_sample_ids:
        sample_ids = [int(_.split('.')[1]) for _ in df.columns]
        df.columns = sample_ids
    # Drop features whose mean relative abundance does not exceed `percent`
    if normalize_relative_abundance:
        df = df.apply(lambda x: x/x.sum(), axis=0)
        df = df[df.apply(lambda x: x.mean(), axis=1) > percent]
    else:
        df_temp = df.apply(lambda x: x/x.sum(), axis=0)
        df = df[df_temp.apply(lambda x: x.mean(), axis=1) > percent]
    # Drop features observed in no more than `prev(df)` samples (half the samples by default)
    df = df[df.apply(lambda x: (x >= 1).sum(), axis=1) > prev(df)]
    if normalize_relative_abundance:
        df = df.apply(lambda x: x/x.sum(), axis=0)
    return df.sort_index(axis=0).sort_index(axis=1)
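A minimal sketch (hypothetical toy counts, not study data) of what the filters above do, mirroring the notebook's `normalize_relative_abundance=False` call style: samples are rescaled to the median depth, then features are kept only if their mean relative abundance exceeds `percent` and they appear in more than `prev(df)` samples.
toy = pd.DataFrame([[10, 0, 5, 8],
                    [90, 50, 45, 72],
                    [0, 0, 1, 0]],
                   index=['taxA', 'taxB', 'taxC'],
                   columns=['s1', 's2', 's3', 's4'])
# taxC is seen in only one sample, so the prevalence filter (here > 2 samples) drops it
print(normalize_taxatable(toy, map_sample_ids=False, normalize_relative_abundance=False).index)  # ['taxA', 'taxB']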
In [12]:
# Normalize the full depth taxatable
fulldepth_df = normalize_taxatable(fulldepth_df, percent=.0001, normalize_relative_abundance=False)
# Replace zeros with a 0.65 pseudocount (multiplicative simple replacement) so the CLR below is defined
fulldepth_df[fulldepth_df == 0] = .65
fulldepth_df = fulldepth_df.apply(lambda x: x/x.sum(), axis=0)
In [13]:
fulldepth_df.head()
Out[13]:
In [14]:
if CLR:
    # clr operates on rows (samples), so transpose to Samples x Features first
    data = fulldepth_df.T.values
    data = clr(data)
    fulldepth_df = pd.DataFrame(data, index=fulldepth_df.columns, columns=fulldepth_df.index)
else:
    fulldepth_df = fulldepth_df.T
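For intuition, a hand-checked toy composition (not study data): the CLR log-ratios each part against the geometric mean of its sample, so every transformed row sums to zero.
x = np.array([[.1, .2, .7]])
print(clr(x))                       # ~[[-0.88, -0.19, 1.07]] = ln(x / geometric mean of x)
print(np.isclose(clr(x).sum(), 0))  # True: CLR rows always sum to zero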
In [15]:
fulldepth_df.shape
Out[15]:
In [16]:
depths = ["1", "01", "001", "0001"]  # subsampling levels: .1, .01, .001, .0001 of full depth
dice_coefficients = []
df_depths = {'fulldepth': fulldepth_df}
for depth in depths:
    depth_df = pd.read_csv("../results/karlsson2013/{}_1/taxatable.{}.species.normalized.txt".format(depth, TAXONOMIC_PROFILER), sep="\t", index_col=0)
    sample_ids = [int(_.split('.')[1]) for _ in depth_df.columns]
    hit_rate[depth] = depth_df.sum().sum()/depths_dict[depth]
    depth_df.columns = sample_ids
    # Align the subsampled table to the full-depth feature space (missing taxa stay zero)
    temp = pd.DataFrame(np.zeros(fulldepth_df.shape), index=fulldepth_df.index, columns=fulldepth_df.columns)
    for index in depth_df.index:
        if index in temp.columns:
            temp[str(index)] = depth_df.loc[str(index), :]
    depth_df = temp
    depth_df = normalize_taxatable(depth_df.T, prev=lambda df: 0, map_sample_ids=False, normalize_relative_abundance=False)
    # Dice dissimilarity of each sample's profile against the full feature set (all ones)
    for _, row in depth_df.T.iterrows():
        dice_coefficients.append([np.log10(float("." + str(depth))), dice(row, np.ones(len(row))), TAXONOMIC_PROFILER])
    depth_df[depth_df == 0] = .65
    depth_df = depth_df.apply(lambda x: x/x.sum(), axis=0)
    depth_df = depth_df.T
    if CLR:
        data = depth_df.values
        data = clr(data)
        depth_df = pd.DataFrame(data, index=depth_df.index, columns=depth_df.columns)
    df_depths[depth] = depth_df
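A hand-checked sketch (toy vectors, not study data) of scipy's `dice` on presence/absence vectors: comparing a sample's detected features against the full feature set (all ones) scores the share of full-depth features that go undetected.
u = np.array([True, True, True, False])  # 3 of 4 features detected
v = np.ones(4, dtype=bool)               # the full feature set
# Dice dissimilarity = (c_TF + c_FT) / (2*c_TT + c_FT + c_TF) = 1 / (2*3 + 1)
print(dice(u, v))                        # ~0.143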
In [17]:
df_dice = pd.DataFrame(dice_coefficients, columns=['depth', 'correlation', 'profiler'])
In [18]:
depth_df.head()
Out[18]:
In [19]:
hit_rate
# Box plot of hit rate across all depths
fig = plt.figure()
plt.boxplot(list(hit_rate.values()), labels=[""])
plt.ylabel("Fraction of Reads that Hit the Database")
plt.title("Hit Rate")
save_plot(fig, "hit_rate")
In [20]:
def multiclass_roc(X, y, classifier=CLASSIFIER(), classes=['IGT', 'NGT', 'T2D']):
    # y is already binarized (one indicator column per class)
    n_classes = len(classes)
    # Shuffle and split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)
    # Learn to predict each class against the rest
    y_score = classifier.fit(X_train, y_train).predict_proba(X_test)
    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    return fpr, tpr, roc_auc
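A hand-checked sketch (toy labels, not study data) of the micro-averaging above: `ravel()` flattens the binarized label and score matrices so every (sample, class) pair becomes one decision in a single binary ROC.
y_test_toy = np.array([[1, 0, 0],
                       [0, 0, 1]])
y_score_toy = np.array([[.8, .1, .1],
                        [.2, .3, .5]])
fpr_m, tpr_m, _ = roc_curve(y_test_toy.ravel(), y_score_toy.ravel())
print(auc(fpr_m, tpr_m))  # 1.0 here: every true class outscores every negative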
In [21]:
def plot_roc_auc_classifier(depth, fpr, tpr, roc_auc, ix=2, classes=['IGT', 'NGT', 'T2D']):
    fig = plt.figure()
    lw = 2
    plt.plot(fpr[ix], tpr[ix], color=COLORS[0], lw=lw,
             label='%s ROC curve (area = %0.2f)' % (classes[ix], roc_auc[ix]), alpha=.8)
    plt.plot([0, 1], [0, 1], color=COLORS[1], lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    if depth != 'fulldepth':
        plt.title('Receiver Operating Characteristic %s depth=.%s' % (classes[ix], str(depth)))
    else:
        plt.title('Receiver Operating Characteristic %s' % (classes[ix]))
    plt.legend(loc="lower right")
    save_plot(fig, 'roc_{}_{}'.format(depth, classes[ix]))

# Compute macro-average ROC curve and ROC area
def plot_multiroc_auc_classifier(depth, fpr, tpr, roc_auc, classes=['IGT', 'NGT', 'T2D']):
    lw = 2
    n_classes = len(classes)
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    # Finally average it and compute AUC
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    # Plot all ROC curves
    fig = plt.figure()
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
             color='deeppink', linestyle=':', linewidth=4)
    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4, alpha=.8)
    for i, color, target_name in zip(range(n_classes), COLORS, classes):
        plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                 label='ROC curve of class {0} (area = {1:0.2f})'.format(target_name, roc_auc[i]), alpha=.8)
    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    if depth != 'fulldepth':
        plt.title('Multiclass Receiver Operating Characteristic depth=.%s' % str(depth))
    else:
        plt.title('Multiclass Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    save_plot(fig, 'multiroc_{}'.format(depth))
In [22]:
for depth, df in df_depths.items():
    temp_y = join_df["Classification "].values
    temp_y = label_binarize(temp_y, classes=['IGT', 'NGT', 'T2D'])
    # mask = temp_y[:, 0] == 0
    # df = df[mask]
    # temp_y = temp_y[mask]
    # temp_y = temp_y[:, 1:]
    # Train/evaluate the multiclass classifier on a single 50/50 split at this depth
    fpr, tpr, roc_auc = multiclass_roc(df, temp_y, classes=['IGT', 'NGT', 'T2D'])
    plot_multiroc_auc_classifier(depth, fpr, tpr, roc_auc, classes=['IGT', 'NGT', 'T2D'])
    # for i in range(3):
    #     plot_roc_auc_classifier(depth, fpr, tpr, roc_auc, ix=i)
In [23]:
if RUN_TWO_CLASS_ROC_AUC:
    for depth, df in df_depths.items():
        temp_y = join_df["Classification "].values
        temp_y = label_binarize(temp_y, classes=['IGT', 'NGT', 'T2D'])
        # Drop IGT samples and keep only the NGT/T2D indicator columns
        mask = temp_y[:, 0] == 0
        df = df[mask]
        temp_y = temp_y[mask]
        temp_y = temp_y[:, 1:]
        fpr, tpr, roc_auc = multiclass_roc(df, temp_y, classes=['NGT', 'T2D'])
        plot_multiroc_auc_classifier(depth, fpr, tpr, roc_auc, classes=['NGT', 'T2D'])
        # for i in range(3):
        #     plot_roc_auc_classifier(depth, fpr, tpr, roc_auc, ix=i)
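A hand-checked sketch (toy labels, not study data) of the masking used above and in the next cell: `label_binarize` one-hot encodes the three classes, a zero in the first (IGT) column marks the NGT/T2D samples, and dropping that column leaves the two-class targets.
labels = np.array(['IGT', 'NGT', 'T2D', 'NGT'])
binarized = label_binarize(labels, classes=['IGT', 'NGT', 'T2D'])
# binarized == [[1,0,0], [0,1,0], [0,0,1], [0,1,0]]
mask = binarized[:, 0] == 0    # keeps the three non-IGT samples
print(binarized[mask][:, 1:])  # NGT/T2D indicator columns only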
In [40]:
result_cv_scores = []
for depth, df in df_depths.items():
    temp_y = join_df["Classification "].values
    temp_y = label_binarize(temp_y, classes=['IGT', 'NGT', 'T2D'])
    # Restrict to the two-class NGT vs. T2D problem
    mask = temp_y[:, 0] == 0
    df = df[mask]
    temp_y = temp_y[mask]
    temp_y = temp_y[:, 1:]
    fpr, tpr, roc_auc = multiclass_roc(df, temp_y, classes=['NGT', 'T2D'])
    # More than 5 folds glitches here, likely too few samples of each class per fold
    scores = cross_val_score(CLASSIFIER(), df, temp_y, cv=5, scoring="roc_auc")
    for score in scores:
        if depth == 'fulldepth':
            depth_name = 0
        else:
            depth_name = np.log10(float("." + str(depth)))
        result_cv_scores.append([depth_name, score, TAXONOMIC_PROFILER])
    for i in range(len(CLASSES)):
        plot_roc_auc_classifier(depth, fpr, tpr, roc_auc, ix=i, classes=CLASSES)
df_result_cv_scores = pd.DataFrame(result_cv_scores, columns=['depth', 'score', 'profiler'])
In [41]:
# df_result_cv_scores['mean'] = df_result_cv_scores.values[:,1:].mean(axis=1)
# df_result_cv_scores = df_result_cv_scores.sort_values('mean')
df_result_cv_scores.to_csv('../figures/karlsson2013_roc_cv_table.txt')
g = sns.factorplot(x="depth", y="score", kind="box", data=df_result_cv_scores, palette=COLORS, legend=False)
g.set_ylabels("ROC AUC").set_xlabels("Depth (log10)")
plt.title("Classification Accuracy by Depth")
pltname = "roc_auc"
save_plot(g, pltname)
In [26]:
def plot_pca(df_depths, y, colors=COLORS, classes=['NGT', 'T2D']):
    n_classes = len(classes)
    fulldepth_df = df_depths['fulldepth']
    # Fit PCA on the full-depth table, then project every depth onto the same axes
    pca = PCA(n_components=2)
    pca.fit(fulldepth_df)
    # Percentage of variance explained by each component
    print('explained variance ratio (first two components): %s'
          % str(pca.explained_variance_ratio_))
    for depth, df in df_depths.items():
        X_r = pca.transform(df)
        fig = plt.figure()
        lw = 2
        for color, target_name in zip(colors, classes):
            plt.scatter(X_r[y == target_name, 0], X_r[y == target_name, 1],
                        color=color, alpha=.8, lw=lw, label=target_name)
        plt.legend(loc='best', shadow=False, scatterpoints=1)
        if depth != 'fulldepth':
            plt.title('PCA of Karlsson dataset depth=.%s' % str(depth))
        else:
            plt.title('PCA of Karlsson dataset')
        save_plot(fig, 'pca_{}_nclasses{}'.format(depth, str(n_classes)))
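The design choice worth noting: PCA is fit once on the full-depth table and the subsampled tables are only projected onto those axes, so all panels share one coordinate system. A minimal sketch with hypothetical matrices:
rng = np.random.RandomState(0)
X_full = rng.normal(size=(20, 5))                    # stand-in for the full-depth table
X_sub = X_full + rng.normal(scale=.1, size=(20, 5))  # stand-in for a subsampled table
shared_pca = PCA(n_components=2).fit(X_full)
# Both projections share axes; refitting on X_sub would rotate them arbitrarily
proj_full, proj_sub = shared_pca.transform(X_full), shared_pca.transform(X_sub)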
In [27]:
df_depths2 = {}
for depth, df in df_depths.items():
    temp_y = join_df["Classification "].values
    # Drop IGT samples for the two-class PCA
    mask = temp_y != 'IGT'
    df = df[mask]
    temp_y = temp_y[mask]
    df_depths2[depth] = df
plot_pca(df_depths2, temp_y)
plot_pca(df_depths, join_df["Classification "].values)
plot_pca(df_depths, join_df["Classification "].values, classes=['IGT', 'NGT', 'T2D'])
In [28]:
def plot_lda(df_depths, y, colors=COLORS, classes=['NGT', 'T2D']):
    fulldepth_df = df_depths['fulldepth']
    n_classes = len(classes)
    # Fit LDA on the full-depth table, then project every depth onto the same axes
    lda = LinearDiscriminantAnalysis(n_components=2)
    lda.fit(fulldepth_df, y)
    lw = 2
    for depth, df in df_depths.items():
        X_r2 = lda.transform(df)
        fig = plt.figure()
        for color, target_name in zip(colors, classes):
            plt.scatter(X_r2[y == target_name, 0], X_r2[y == target_name, 1],
                        alpha=.8, color=color, lw=lw, label=target_name)
        plt.legend(loc='best', shadow=False, scatterpoints=1)
        if depth != 'fulldepth':
            plt.title('LDA of Karlsson dataset depth=.%s' % str(depth))
        else:
            plt.title('LDA of Karlsson dataset')
        save_plot(fig, 'lda_{}'.format(depth))
In [29]:
plot_lda(df_depths, join_df["Classification "].values, classes=['IGT', 'NGT', 'T2D'])
In [30]:
# Pearson Correlation
from scipy.stats import pearsonr
def pearson_correlations(df_depths):
    pearson_correlations = []
    fulldepth_df = df_depths['fulldepth']
    for depth, df in df_depths.items():
        if depth != 'fulldepth':
            # Per-sample Pearson correlation between subsampled and full-depth profiles
            for i, row in fulldepth_df.iterrows():
                pearson_correlations.append([np.log10(float("." + str(depth))), pearsonr(row, df.loc[i])[0], TAXONOMIC_PROFILER])
    # Full depth correlates perfectly with itself (anchor point at depth 0)
    pearson_correlations.append([0, 1.0, TAXONOMIC_PROFILER])
    return pd.DataFrame(pearson_correlations, columns=['depth', 'correlation', 'profiler'])
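A hand-checked toy (not study data) of the difference between this cell and the Spearman version below: Pearson rewards linear agreement, Spearman only rank agreement.
from scipy.stats import spearmanr
a = np.array([1.0, 2.0, 3.0, 4.0])
b = np.array([1.0, 2.0, 3.0, 10.0])  # monotone in a, but not linear
print(pearsonr(a, b)[0])             # ~0.89: the outlier breaks linearity
print(spearmanr(a, b)[0])            # 1.0: the ranks agree exactly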
In [31]:
pearson_df = pearson_correlations(df_depths)
In [32]:
g = sns.factorplot(x="depth", y="correlation", hue="profiler", data=pearson_df, palette=COLORS, capsize=.2, size=6, aspect=.75, alpha=.1, legend=False)
g.set_ylabels("Pearson Correlation").set_xlabels("Depth (log10)")
plt.title("Karlsson Pearson Correlation with Full Depth")
pltname = "pearson_correlation"
save_plot(g, pltname)
In [33]:
# Spearman Correlation
from scipy.stats import spearmanr
def spearman_correlations(df_depths):
    spearman_correlations = []
    fulldepth_df = df_depths['fulldepth']
    for depth, df in df_depths.items():
        if depth != 'fulldepth':
            # Per-sample Spearman correlation between subsampled and full-depth profiles
            for i, row in fulldepth_df.iterrows():
                spearman_correlations.append([np.log10(float("." + str(depth))), spearmanr(row, df.loc[i])[0], TAXONOMIC_PROFILER])
    # Full depth correlates perfectly with itself (anchor point at depth 0)
    spearman_correlations.append([0, 1.0, TAXONOMIC_PROFILER])
    return pd.DataFrame(spearman_correlations, columns=['depth', 'correlation', 'profiler'])
In [34]:
spearman_df = spearman_correlations(df_depths)
g = sns.factorplot(x="depth", y="correlation", hue="profiler", data=spearman_df, palette=COLORS, capsize=.2, size=6, aspect=.75, alpha=.1, legend=False)
g.set_ylabels("Spearman Correlation").set_xlabels("Depth (log10)")
plt.title("Spearman Correlation with Full Depth")
pltname = "spearman_correlation"
save_plot(g, pltname)
In [35]:
from skbio.stats.composition import ancom, clr_inv
results = []
for depth, df in df_depths.items():
    if CLR:
        # Undo the CLR so ANCOM sees compositions again
        data = clr_inv(df)
        df = pd.DataFrame(data, index=df.index, columns=df.columns)
    results.append([depth, ancom(df, join_df["Classification "])])
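Per scikit-bio's documented return value, `ancom` yields a pair: a per-feature results DataFrame (with a boolean 'Reject null hypothesis' column) and a table of percentile abundances. That is why the summary cell below indexes `result[1][0]`.
depth_label, (ancom_df, percentiles) = results[0]
print(ancom_df.columns)  # includes 'W' and 'Reject null hypothesis'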
In [36]:
g = sns.factorplot(x="depth", y="correlation", hue="profiler", data=df_dice, palette=COLORS, capsize=.2, size=6, aspect=.75, alpha=.1, legend=False)
g.set_ylabels("Dice Dissimilarity").set_xlabels("Depth (log10)")
plt.title("Dice Dissimilarity vs. Full Depth")
pltname = "dice_coefficients"
save_plot(g, pltname)
In [37]:
for result in results:
    # result[1][0] is the ANCOM results DataFrame; count the features called significant
    print(result[1][0]['Reject null hypothesis'].sum())