In [1]:
import numpy as np
import math
import csv
import glob
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from itertools import cycle
from skbio.stats.composition import clr
from scipy.spatial.distance import dice
from sklearn import svm
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp

sns.set_style("ticks")
sns.palplot(sns.color_palette("colorblind", 10))

%matplotlib inline

In [2]:
COLORS = sns.color_palette("colorblind", 5)
TAXONOMIC_PROFILER = 'burst'
CLR = True
# CLASSIFIER = lambda: OneVsRestClassifier(RandomForestClassifier(n_estimators=12))
CLASSIFIER = lambda: OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=1))
CLASSES = ['NGT', 'T2D']
RUN_TWO_CLASS_ROC_AUC = False

In [3]:
def save_plot(fig, pltname, artists=()):
    fig.savefig(os.path.join("..", "figures", "karlsson2013_" + pltname + ".png"), dpi=300, bbox_extra_artists=artists, bbox_inches='tight')

In [4]:
# Load up the mapping file
mapping_df = pd.read_csv('../data/karlsson2013/map.txt', sep="\t")
mapping_df.dtypes


Out[4]:
#Sample ID                                                                                  int64
Age (years)                                                                               float64
Classification                                                                             object
GAD-antibodies (units1)                                                                   float64
BMI (kg/m2)                                                                               float64
WHR (cm/cm)                                                                               float64
WC (cm)                                                                                     int64
Cholesterol (mmol/L)                                                                      float64
Triglycerides (mmol/L)                                                                    float64
HDL (mmol/L)                                                                              float64
LDL (mmol/L)                                                                              float64
Creatinine (µmol/L)                                                                         int64
ƴ-GT (µkat/L)                                                                             float64
Fasting glucose (mmol/L)                                                                   object
Fasting Insulin (mU/L)                                                                    float64
HbA1c (mmol/mol)                                                                          float64
Adiponectin (mg/L)                                                                         object
Leptin (µg/L)                                                                              object
GLP-1 (pmol/L)                                                                             object
FGF-19 (pg/ml)                                                                             object
hsCRP (mg/L)                                                                              float64
C-peptide (nmol/L)                                                                        float64
TNFα (ng/L)                                                                               float64
IL-1 (pg/ml)                                                                               object
CD163 (ng/ml)                                                                              object
Statins (Y, N)                                                                             object
Insulin (Y, N)                                                                             object
Oral anti-diabetic medication (-, no medication; Met, metformin; Sulph, sulphonylurea)     object
Country of birth                                                                           object
Years in Sweden                                                                            object
dtype: object

In [5]:
# What are these classifications?
set(mapping_df['Classification '])
# Disease classification: NGT = normal glucose tolerance; IGT = impaired glucose tolerance (i.e. pre-diabetic); T2D = type 2 diabetes


Out[5]:
{'IGT', 'NGT', 'T2D'}

In [6]:
# Get the sequence depth (total read count) of each subsampled dataset
depths_dict = {}
files = glob.glob("../results/karlsson2013/**/count.seqs.txt")
for file in files:
    sequence_depth = !cat {file}
    depth = file.split('/')[-2].split('_')[0]
    depths_dict[depth] =  int(sequence_depth[0])

In [38]:
depths_dict


Out[38]:
{'0001': 144452,
 '001': 1442792,
 '01': 14410781,
 '1': 144140740,
 'fulldepth': 1441299337}
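
The directory labels encode the subsampling fraction with an implied leading decimal point. A minimal sketch of the conversion used later for the depth axes (a toy loop over the four subsampled labels above):

In [ ]:
# Hypothetical check of the label-to-fraction conversion, e.g. '001' -> 0.001;
# log10 of that fraction is used as the x-axis in the plots below.
for label in ['1', '01', '001', '0001']:
    print(label, float("." + label), np.log10(float("." + label)))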

In [7]:
# Load up the list of taxonomic profilers
taxonomic_profilers = ['utree', 'burst', 'burst.taxonomy', 'bowtie2']
# BURST (capitalist mode) is selected via TAXONOMIC_PROFILER above

In [8]:
hit_rate = {}
# Join the full-depth taxa table with the mapping file on sample ID
fulldepth_df = pd.read_csv("../results/karlsson2013/fulldepth_1/taxatable.{}.species.txt".format(TAXONOMIC_PROFILER), sep="\t", index_col=0)
sample_ids = [int(_.split('.')[1]) for _ in fulldepth_df.columns]
temp_fulldepth_df = fulldepth_df.T
hit_rate['fulldepth'] = fulldepth_df.sum().sum()/depths_dict['fulldepth']
temp_fulldepth_df['#Sample ID'] = sample_ids
join_df = temp_fulldepth_df.merge(mapping_df, on="#Sample ID", how="inner")
join_df["Classification "].describe()


Out[8]:
count     144
unique      3
top       T2D
freq       53
Name: Classification , dtype: object

In [9]:
pd.value_counts(join_df["Classification "])


Out[9]:
T2D    53
IGT    48
NGT    43
Name: Classification , dtype: int64

In [10]:
classes = set(('IGT', 'NGT', 'T2D'))
join_df = join_df[join_df['Classification '].isin(classes)]
join_df.index = join_df["#Sample ID"]
join_df = join_df.sort_index()

In [11]:
def normalize_taxatable(df, prev=lambda df: int(df.shape[1]/2), percent=0, map_sample_ids=True, normalize_relative_abundance=True):
    # Input: features x samples count table
    # Rescale every sample to the median sequencing depth
    df = df.div(df.sum(axis=0).div(df.sum(axis=0).median()), axis=1).round().astype(int)
    if map_sample_ids:
        sample_ids = [int(_.split('.')[1]) for _ in df.columns]
        df.columns = sample_ids
    if normalize_relative_abundance:
        df = df.apply(lambda x: x/x.sum(), axis=0)
        df = df[df.apply(lambda x: x.mean(), axis=1) > percent]
    else:
        df_temp = df.apply(lambda x: x/x.sum(), axis=0)
        df = df[df_temp.apply(lambda x: x.mean(), axis=1) > percent]
#     print(df.apply(lambda x: (x > 0).sum(), axis=1))
    df = df[df.apply(lambda x: (x >= 1).sum(), axis=1) > prev(df)]
    if normalize_relative_abundance:
        df = df.apply(lambda x: x/x.sum(), axis=0)
    return df.sort_index(axis=0).sort_index(axis=1)
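
A toy example (made-up counts and hypothetical taxon/sample names) of how the function behaves with the settings used below: a taxon is kept only if its mean relative abundance exceeds percent and it is detected in more than prev(df) samples (half of them, rounded down, by default).

In [ ]:
# Toy features x samples table; taxonD is detected in only one of three samples,
# so the prevalence filter (prev = int(3/2) = 1 here) drops it.
toy = pd.DataFrame({'s1': [900, 90, 10, 0],
                    's2': [800, 150, 0, 50],
                    's3': [700, 200, 100, 0]},
                   index=['taxonA', 'taxonB', 'taxonC', 'taxonD'])
normalize_taxatable(toy, percent=0.0001, map_sample_ids=False, normalize_relative_abundance=False)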

In [12]:
# Normalize the full-depth taxa table; zeros are then replaced with a 0.65 pseudo-count so the CLR transform below is defined
fulldepth_df = normalize_taxatable(fulldepth_df, percent=.0001, normalize_relative_abundance=False)
fulldepth_df[fulldepth_df == 0] = .65
fulldepth_df = fulldepth_df.apply(lambda x: x/x.sum(), axis=0)

In [13]:
fulldepth_df.head()


Out[13]:
51 53 54 58 59 60 77 80 88 92 ... 604 614 617 618 628 638 640 643 646 655
#OTU ID
k__Archaea;p__Euryarchaeota;c__Methanobacteria;o__Methanobacteriales;f__Methanobacteriaceae;g__Methanobrevibacter;s__Methanobrevibacter_smithii 0.000036 0.015895 0.059891 0.016667 0.045543 0.002146 2.179714e-02 0.019733 0.067902 8.975381e-06 ... 3.469546e-07 0.000002 2.265747e-03 1.028675e-06 0.014766 0.000008 2.425175e-06 0.043161 0.000006 0.000067
k__Bacteria;p__;c__;o__;f__;g__;s__bacterium_LF-3 0.000654 0.003896 0.036820 0.008164 0.007381 0.008369 1.929783e-02 0.001721 0.000759 1.935886e-02 ... 1.209484e-03 0.012294 1.188774e-02 1.176701e-02 0.019588 0.002573 5.422553e-02 0.002703 0.000245 0.005952
k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;s__Bifidobacterium_adolescentis 0.012508 0.036187 0.008047 0.003052 0.012644 0.013321 1.409705e-05 0.041603 0.001616 8.284967e-06 ... 4.545106e-05 0.000713 3.805072e-05 2.029061e-02 0.010492 0.000357 1.455105e-05 0.004639 0.000005 0.156644
k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;s__Bifidobacterium_angulatum 0.000067 0.000271 0.000079 0.000709 0.000111 0.000221 6.876611e-07 0.000522 0.000029 6.904139e-06 ... 3.851196e-05 0.000225 1.833353e-05 1.769321e-04 0.000084 0.000007 2.251948e-07 0.000036 0.000002 0.000814
k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;s__Bifidobacterium_animalis 0.000002 0.000013 0.000003 0.001868 0.000004 0.000042 1.719153e-06 0.000003 0.001370 3.452070e-07 ... 3.469546e-07 0.000002 2.248451e-07 2.228795e-07 0.000005 0.000008 6.236164e-06 0.000036 0.000003 0.005782

5 rows × 144 columns


In [14]:
if CLR:
    data = fulldepth_df.T.values
    data = clr(data)
    fulldepth_df = pd.DataFrame(data, index=fulldepth_df.columns, columns=fulldepth_df.index)
else:
    fulldepth_df = fulldepth_df.T
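
For reference, a minimal sketch (toy, made-up composition) of the zero-replacement-plus-CLR step applied above: zeros are replaced with a small pseudo-count, the sample is closed to proportions, and clr returns centered log-ratios that sum to zero.

In [ ]:
# Toy composition with one zero count, replaced by the same 0.65 pseudo-count used above.
toy = np.array([0.0, 12.0, 30.0, 58.0])
toy[toy == 0] = .65
toy = toy / toy.sum()              # close to proportions
print(clr(toy), clr(toy).sum())    # log(x) - mean(log(x)); the sum is ~0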

In [15]:
fulldepth_df.shape


Out[15]:
(144, 202)

In [16]:
depths = ["1", "01", "001", "0001"]
dice_coefficients = []
df_depths = {'fulldepth': fulldepth_df}
for depth in depths:
    depth_df = pd.read_csv("../results/karlsson2013/{}_1/taxatable.{}.species.normalized.txt".format(depth, TAXONOMIC_PROFILER), sep="\t", index_col=0)
    sample_ids = [int(_.split('.')[1]) for _ in depth_df.columns]
    hit_rate[depth] = depth_df.sum().sum()/depths_dict[depth]
    depth_df.columns = sample_ids
    # Align the subsampled table to the full-depth samples x taxa layout (taxa missing at this depth stay 0)
    temp = pd.DataFrame(np.zeros(fulldepth_df.shape), index=fulldepth_df.index, columns=fulldepth_df.columns)
    for index in depth_df.index:
        if index in temp.columns:
            temp[str(index)] = depth_df.loc[str(index), :]
    depth_df = temp
    depth_df = normalize_taxatable(depth_df.T, prev=lambda df: 0, map_sample_ids=False, normalize_relative_abundance=False)
    for _, row in depth_df.T.iterrows():
        dice_coefficients.append([np.log10(float("." + str(depth))), dice(row, np.ones(len(row))), TAXONOMIC_PROFILER])
    depth_df[depth_df == 0] = .65
    depth_df = depth_df.apply(lambda x: x/x.sum(), axis=0)
    depth_df = depth_df.T
    if CLR:
        data = depth_df.values
        data = clr(data)
        depth_df = pd.DataFrame(data, index=depth_df.index, columns=depth_df.columns)
    df_depths[depth] = depth_df
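
A toy illustration (made-up presence/absence vector) of the Dice dissimilarity computed in the loop above: each subsampled sample is compared against an all-ones vector, so the distance grows as more of the full-depth taxa go undetected at the shallower depth.

In [ ]:
# Hypothetical: 3 of 5 full-depth taxa are detected at the shallow depth.
detected = np.array([1, 1, 0, 0, 1])
reference = np.ones(len(detected))
print(dice(detected, reference))   # (0 + 2) / (2*3 + 2 + 0) = 0.25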

In [17]:
df_dice = pd.DataFrame(dice_coefficients, columns=['depth', 'correlation', 'profiler'])

In [18]:
depth_df.head()


Out[18]:
#OTU ID k__Archaea;p__Euryarchaeota;c__Methanobacteria;o__Methanobacteriales;f__Methanobacteriaceae;g__Methanobrevibacter;s__Methanobrevibacter_smithii k__Bacteria;p__;c__;o__;f__;g__;s__bacterium_LF-3 k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;s__Bifidobacterium_adolescentis k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;s__Bifidobacterium_angulatum k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;s__Bifidobacterium_animalis k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;s__Bifidobacterium_bifidum k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;s__Bifidobacterium_breve k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;s__Bifidobacterium_dentium k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;s__Bifidobacterium_longum k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Micrococcales;f__Cellulomonadaceae;g__Cellulomonas;s__Cellulomonas_carbonis ... k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Desulfovibrionales;f__Desulfovibrionaceae;g__Desulfovibrio;s__Desulfovibrio_piger k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Citrobacter;s__Citrobacter_freundii k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia_coli k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Klebsiella;s__Klebsiella_pneumoniae k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Shigella;s__Shigella_dysenteriae k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Shigella;s__Shigella_flexneri k__Bacteria;p__Verrucomicrobia;c__Verrucomicrobiae;o__Verrucomicrobiales;f__Akkermansiaceae;g__Akkermansia;s__Akkermansia_muciniphila k__BacteriaPlasmid;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__Bacteroides_thetaiotaomicron k__BacteriaPlasmid;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Eubacteriaceae;g__Eubacterium;s__[Eubacterium]_eligens k__BacteriaPlasmid;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia_coli
51 -0.385302 -0.385302 1.431775 -0.385302 -0.385302 -0.385302 -0.385302 -0.385302 0.045481 -0.385302 ... -0.385302 -0.385302 0.045481 -0.385302 -0.385302 -0.385302 1.654919 -0.385302 -0.385302 -0.385302
53 1.082507 -0.446888 2.381790 -0.446888 -0.446888 -0.446888 -0.446888 -0.016105 -0.446888 -0.446888 ... -0.446888 -0.446888 -0.446888 -0.446888 -0.446888 -0.446888 3.351190 -0.446888 0.677042 -0.446888
54 3.213817 2.520670 1.422058 -0.395020 -0.395020 -0.395020 -0.395020 -0.395020 1.134375 -0.395020 ... -0.395020 -0.395020 -0.395020 -0.395020 -0.395020 -0.395020 -0.395020 -0.395020 -0.395020 -0.395020
58 1.908160 1.060862 -0.037750 -0.468533 0.655397 -0.468533 -0.037750 -0.468533 1.348544 -0.468533 ... -0.468533 -0.468533 -0.037750 -0.468533 -0.468533 -0.468533 1.348544 -0.468533 -0.468533 -0.468533
59 1.874714 1.027417 1.027417 -0.501979 -0.501979 -0.501979 -0.501979 -0.501979 0.621951 -0.501979 ... -0.501979 0.621951 0.621951 -0.501979 -0.501979 -0.501979 -0.501979 -0.501979 0.621951 -0.501979

5 rows × 202 columns


In [19]:
hit_rate
# Box plot of hit rates across all depths
fig = plt.figure()
plt.boxplot(list(hit_rate.values()), labels=[""])
plt.ylabel("Percent of Reads that Hit the Database")
plt.title("Hit Rate")
save_plot(fig, "hit_rate")



In [20]:
def multiclass_roc(X, y, classifier=None, classes=['IGT', 'NGT', 'T2D']):
    # y is expected to be label-binarized (one column per class)
    n_classes = len(classes)
    if classifier is None:
        # Build a fresh classifier per call rather than sharing a mutable default instance
        classifier = CLASSIFIER()

    # shuffle and split training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                        random_state=0)

    # Learn to predict each class against the other
    y_score = classifier.fit(X_train, y_train).predict_proba(X_test)

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    return fpr, tpr, roc_auc
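
As a sanity check, a minimal sketch (toy labels and scores, not from this dataset) of the roc_curve/auc calls that multiclass_roc makes for each class column:

In [ ]:
# Toy one-vs-rest labels and decision scores for four samples; AUC is 0.75 here.
y_true = np.array([0, 0, 1, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8])
fpr_toy, tpr_toy, _ = roc_curve(y_true, y_score)
print(auc(fpr_toy, tpr_toy))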

In [21]:
def plot_roc_auc_classifier(depth, fpr, tpr, roc_auc, ix=2, classes=['IGT', 'NGT', 'T2D']):
    fig = plt.figure()
    lw = 2
    plt.plot(fpr[ix], tpr[ix], color=COLORS[0],
             lw=lw, label='%s ROC curve (area = %0.2f)' % (classes[ix], roc_auc[ix]), alpha=.8)
    plt.plot([0, 1], [0, 1], color=COLORS[1], lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    if depth != 'fulldepth':
        plt.title('Receiver Operating Characteristic %s depth=.%s' % (classes[ix], str(depth)))
    else:
        plt.title('Receiver Operating Characteristic %s' % (classes[ix]))
    plt.legend(loc="lower right")
    save_plot(fig, 'roc_{}_{}'.format(depth, classes[ix]))
    
# Compute macro-average ROC curve and ROC area
def plot_multiroc_auc_classifier(depth, fpr, tpr, roc_auc, classes=['IGT', 'NGT', 'T2D']):
    lw=2
    n_classes = len(classes)
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    fig = plt.figure()
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["micro"]),
             color='deeppink', linestyle=':', linewidth=4)

    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4, alpha=.8)

    for i, color, target_name in zip(range(n_classes), COLORS, classes):
        plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                 label='ROC curve of class {0} (area = {1:0.2f})'.format(target_name, roc_auc[i]), alpha=.8)

    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    if depth != 'fulldepth':
        plt.title('Multiclass Receiver Operating Characteristic depth=.%s' % str(depth))
    else:
        plt.title('Multiclass Receiver Operating Characteristic')    
    plt.legend(loc="lower right")
    save_plot(fig, 'multiroc_{}'.format(depth))

In [22]:
for depth, df in df_depths.items():
    temp_y = join_df["Classification "].values
    temp_y = label_binarize(temp_y, classes=['IGT', 'NGT', 'T2D'])
#     mask = temp_y[:, 0] == 0
#     df = df[mask]
#     temp_y = temp_y[mask]
#     temp_y = temp_y[:, 1:]
    # Fit a classifier for this depth on a single train/test split and compute ROC curves
    fpr, tpr, roc_auc = multiclass_roc(df, temp_y, classes=['IGT', 'NGT', 'T2D'])
    plot_multiroc_auc_classifier(depth, fpr, tpr, roc_auc, classes=['IGT', 'NGT', 'T2D'])
#     for i in range(3):
#         plot_roc_auc_classifier(depth, fpr, tpr, roc_auc, ix=i)



In [23]:
if RUN_TWO_CLASS_ROC_AUC:
    for depth, df in df_depths.items():
        temp_y = join_df["Classification "].values
        temp_y = label_binarize(temp_y, classes=['IGT', 'NGT', 'T2D'])
        mask = temp_y[:, 0] == 0
        df = df[mask]
        temp_y = temp_y[mask]
        temp_y = temp_y[:, 1:]
        # Fit a classifier for this depth on a single train/test split and compute ROC curves
        fpr, tpr, roc_auc = multiclass_roc(df, temp_y, classes=['NGT', 'T2D'])
        plot_multiroc_auc_classifier(depth, fpr, tpr, roc_auc, classes=['NGT', 'T2D'])
    #     for i in range(3):
    #         plot_roc_auc_classifier(depth, fpr, tpr, roc_auc, ix=i)

In [40]:
result_cv_scores = []
for depth, df in df_depths.items():
    temp_y = join_df["Classification "].values
    temp_y = label_binarize(temp_y, classes=['IGT', 'NGT', 'T2D'])
    mask = temp_y[:, 0] == 0
    df = df[mask]
    temp_y = temp_y[mask]
    temp_y = temp_y[:, 1:]
    # Fit a classifier for this depth on a single train/test split and compute ROC curves
    fpr, tpr, roc_auc = multiclass_roc(df, temp_y, classes=['NGT', 'T2D'])
    # Cross-validation becomes unstable with more than 5 folds here
    scores = cross_val_score(CLASSIFIER(), df, temp_y, cv=5, scoring="roc_auc")
    for score in scores:
        if depth == 'fulldepth':
            depth_name = 0
        else:
            depth_name = np.log10(float("." + str(depth)))
        result_cv_scores.append([depth_name, score, TAXONOMIC_PROFILER])
    for i in range(len(CLASSES)):
        plot_roc_auc_classifier(depth, fpr, tpr, roc_auc, ix=i, classes=CLASSES)
df_result_cv_scores = pd.DataFrame(result_cv_scores, columns=['depth', 'score', 'profiler'])



In [41]:
# df_result_cv_scores['mean'] = df_result_cv_scores.values[:,1:].mean(axis=1)
# df_result_cv_scores = df_result_cv_scores.sort_values('mean')
df_result_cv_scores.to_csv('../figures/karlsson2013_roc_cv_table.txt')
g = sns.factorplot(x="depth", y="score", kind="box", data=df_result_cv_scores, palette=COLORS, legend=False)
g.set_ylabels("ROC AUC").set_xlabels("Depth (log10)")
plt.title("Classification Accuracy by Depth")
pltname = "roc_auc"
save_plot(g, pltname)



In [26]:
def plot_pca(df_depths, y, colors=COLORS, classes=['NGT', 'T2D']):
    n_classes = len(classes)
    fulldepth_df = df_depths['fulldepth']
    pca = PCA(n_components=2)
    pca.fit(fulldepth_df)

    # Percentage of variance explained by each of the first two components
    print('explained variance ratio (first two components): %s'
          % str(pca.explained_variance_ratio_))
    
    for depth, df in df_depths.items():
        X_r = pca.transform(df)
        fig = plt.figure()
        lw = 2
        
        
        for color, target_name in zip(colors, classes):
            plt.scatter(X_r[y == target_name, 0], X_r[y == target_name, 1], color=color, alpha=.8, lw=lw,
                        label=target_name)
        plt.legend(loc='best', shadow=False, scatterpoints=1)
        if depth != 'fulldepth':
            plt.title('PCA of Karlsson dataset depth=.%s' % str(depth))
        else:
            plt.title('PCA of Karlsson dataset')
        save_plot(fig, 'pca_{}_nclasses{}'.format(depth, str(n_classes)))

In [27]:
df_depths2 = {}
for depth, df in df_depths.items():
    temp_y = join_df["Classification "].values
    mask = temp_y != 'IGT'
    df = df[mask]
    temp_y = temp_y[mask]
    df_depths2[depth] = df  

plot_pca(df_depths2, temp_y)
plot_pca(df_depths, join_df["Classification "].values)
plot_pca(df_depths, join_df["Classification "].values, classes=['IGT', 'NGT', 'T2D'])


explained variance ratio (first two components): [ 0.11548391  0.07899261]
explained variance ratio (first two components): [ 0.10989399  0.07376953]
explained variance ratio (first two components): [ 0.10989399  0.07376953]

In [28]:
def plot_lda(df_depths, y, colors=COLORS, classes=['NGT', 'T2D']):
    fulldepth_df = df_depths['fulldepth']
    n_classes = len(classes)
    lda = LinearDiscriminantAnalysis(n_components=2)
    X_r2 = lda.fit(fulldepth_df, y).transform(fulldepth_df)
    
    lw = 2
    
    for depth, df in df_depths.items():
        X_r2 = lda.transform(df)
        fig = plt.figure()
        for color, target_name in zip(colors, classes):
            plt.scatter(X_r2[y == target_name, 0], X_r2[y == target_name, 1], alpha=.8, color=color, lw=lw, label=target_name)
        plt.legend(loc='best', shadow=False, scatterpoints=1)
        if depth != 'fulldepth':
            plt.title('LDA of Karlsson dataset depth=.%s' % str(depth))
        else:
            plt.title('LDA of Karlsson dataset')
        save_plot(fig, 'lda_{}'.format(depth))

In [29]:
plot_lda(df_depths, join_df["Classification "].values, classes=['IGT', 'NGT', 'T2D'])


/export/scratch/miniconda3/envs/analysis_SHOGUN/lib/python3.5/site-packages/sklearn/discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")

In [30]:
# Pearson Correlation
from scipy.stats import pearsonr

def pearson_correlations(df_depths):
    pearson_correlations = []
    fulldepth_df = df_depths['fulldepth']
    for depth, df in df_depths.items():
        if depth != 'fulldepth':
            for i, row in fulldepth_df.iterrows():
                pearson_correlations.append([np.log10(float("." + str(depth))), pearsonr(row, df.loc[i])[0], 'burst'])
    pearson_correlations.append([0, 1.0, 'burst'])
    return pd.DataFrame(pearson_correlations, columns=['depth', 'correlation', 'profiler'])
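
A tiny sketch (made-up abundance vectors) of the per-sample Pearson correlation computed above; the Spearman version further down follows the same pattern.

In [ ]:
# Hypothetical full-depth vs. subsampled profiles for a single sample.
full = np.array([5.0, 3.0, 1.0, 0.2])
shallow = np.array([4.8, 3.3, 0.7, 0.1])
print(pearsonr(full, shallow)[0])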

In [31]:
pearson_df = pearson_correlations(df_depths)

In [32]:
g = sns.factorplot(x="depth", y="correlation", hue="profiler", data=pearson_df, palette=COLORS, capsize=.2, size=6, aspect=.75, alpha=.1, legend=False)
g.set_ylabels(" Correlation").set_xlabels("Depth (log10)")
plt.title("Karlsson Pearson Correlation with Full Depth")
pltname = "pearson_correlation"
save_plot(g, pltname)



In [33]:
# Spearman Correlation
from scipy.stats import spearmanr

def spearman_correlations(df_depths):
    spearman_correlations = []
    fulldepth_df = df_depths['fulldepth']
    for depth, df in df_depths.items():
        if depth != 'fulldepth':
            for i, row in fulldepth_df.iterrows():
                spearman_correlations.append([np.log10(float("." + str(depth))), spearmanr(row, df.loc[i])[0], 'burst'])
    spearman_correlations.append([0, 1.0, 'burst'])
    return pd.DataFrame(spearman_correlations, columns=['depth', 'correlation', 'profiler'])

In [34]:
spearman_df = spearman_correlations(df_depths)
g = sns.factorplot(x="depth", y="correlation", hue="profiler", data=spearman_df, palette=COLORS, capsize=.2, size=6, aspect=.75, alpha=.1, legend=False)
g.set_ylabels("Spearman Correlation").set_xlabels("Depth (log10)")
plt.title("Spearman Correlation with Full Depth")
pltname = "spearman_correlation"
save_plot(g, pltname)



In [35]:
from skbio.stats.composition import ancom, clr_inv

results = []
for depth, df in df_depths.items():
    if CLR:
        data = clr_inv(df)
        df = pd.DataFrame(data, index=df.index, columns=df.columns)
    results.append([depth, ancom(df, join_df["Classification "])])

In [36]:
g = sns.factorplot(x="depth", y="correlation", hue="profiler", data=df_dice, palette=COLORS, capsize=.2, size=6, aspect=.75, alpha=.1, legend=False)
g.set_ylabels("Dice Coefficients").set_xlabels("Depth (log10)")
plt.title("Dice Coefficients with Full Depth")
pltname = "dice_coefficients"
save_plot(g, pltname)



In [37]:
# Number of taxa for which ANCOM rejects the null hypothesis at each depth
for result in results:
    print(result[1][0]['Reject null hypothesis'].sum())


0
0
0
0
202