In [1]:
import glob
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import dice
from skbio.stats.composition import clr
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize
# Note: scipy.interp was removed from recent SciPy; np.interp is used below instead
sns.set_style("ticks")
sns.palplot(sns.color_palette("colorblind", 10))
%matplotlib inline
In [2]:
COLORS = sns.color_palette("colorblind", 5)
TAXONOMIC_PROFILER = 'burst'
CLR = True
# CLASSIFIER = lambda: OneVsRestClassifier(RandomForestClassifier(n_estimators=12))
CLASSIFIER = lambda: OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=0))  # random_state takes an int seed, not a bool
CLASSES = ['NGT', 'T2D']
RUN_TWO_CLASS_ROC_AUC = False
In [3]:
def save_plot(fig, pltname, artists=()):
    fig.savefig(os.path.join("..", "figures", "karlsson2013_" + pltname + ".png"),
                dpi=300, bbox_extra_artists=artists, bbox_inches='tight')
In [4]:
# Load up the mapping file
mapping_df = pd.read_csv('../data/karlsson2013/map.txt', sep="\t")
mapping_df.dtypes
Out[4]:
In [5]:
# What are these classifications?
set(mapping_df['Classification '])
# Disease classification: NGT = normal glucose tolerance; IGT = impaired glucose tolerance (i.e. pre-diabetic); T2D = type 2 diabetes
Out[5]:
In [6]:
# Get the sequence depth of each subsampling level
depths_dict = {}
files = glob.glob("../results/karlsson2013/**/count.seqs.txt")
for file in files:
    # !cat returns an IPython SList; the first line holds the read count
    sequence_depth = !cat {file}
    depth = file.split('/')[-2].split('_')[0]
    depths_dict[depth] = int(sequence_depth[0])
In [38]:
depths_dict
Out[38]:
In [7]:
# Load up the list of taxonomic profilers
taxonomic_profilers = ['utree', 'burst', 'burst.taxonomy', 'bowtie2']
# BURST (capitalist mode) is selected via TAXONOMIC_PROFILER above
In [8]:
hit_rate = {}
# Compare samples to mapping file
fulldepth_df = pd.read_csv("../results/karlsson2013/fulldepth_1/taxatable.{}.species.txt".format(TAXONOMIC_PROFILER), sep="\t", index_col=0)
sample_ids = [int(_.split('.')[1]) for _ in fulldepth_df.columns]
temp_fulldepth_df = fulldepth_df.T
# Hit rate = reads assigned by the profiler / total reads sequenced
hit_rate['fulldepth'] = fulldepth_df.sum().sum()/depths_dict['fulldepth']
temp_fulldepth_df['#Sample ID'] = sample_ids
join_df = temp_fulldepth_df.merge(mapping_df, on="#Sample ID", how="inner")
join_df["Classification "].describe()
Out[8]:
In [9]:
join_df["Classification "].value_counts()
Out[9]:
In [10]:
classes = {'IGT', 'NGT', 'T2D'}
join_df = join_df[join_df['Classification '].isin(classes)]
join_df.index = join_df["#Sample ID"]
join_df = join_df.sort_index()
In [11]:
def normalize_taxatable(df, prev=lambda df: int(df.shape[1]/2), percent=0, map_sample_ids=True, normalize_relative_abundance=True):
    # df is Features x Samples
    # Rescale every sample to the median sequencing depth
    df = df.div(df.sum(axis=0).div(df.sum(axis=0).median()), axis=1).round().astype(int)
    if map_sample_ids:
        sample_ids = [int(_.split('.')[1]) for _ in df.columns]
        df.columns = sample_ids
    # Drop features whose mean relative abundance does not exceed `percent`
    if normalize_relative_abundance:
        df = df.apply(lambda x: x/x.sum(), axis=0)
        df = df[df.apply(lambda x: x.mean(), axis=1) > percent]
    else:
        df_temp = df.apply(lambda x: x/x.sum(), axis=0)
        df = df[df_temp.apply(lambda x: x.mean(), axis=1) > percent]
    # Drop features observed in no more than `prev(df)` samples (half the samples by default)
    df = df[df.apply(lambda x: (x >= 1).sum(), axis=1) > prev(df)]
    if normalize_relative_abundance:
        df = df.apply(lambda x: x/x.sum(), axis=0)
    return df.sort_index(axis=0).sort_index(axis=1)
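A minimal sketch (hypothetical toy counts, not study data) of what the filters above do, mirroring the notebook's `normalize_relative_abundance=False` call style: samples are rescaled to the median depth, then features are kept only if their mean relative abundance exceeds `percent` and they appear in more than `prev(df)` samples.
toy = pd.DataFrame([[10, 0, 5, 8],
                    [90, 50, 45, 72],
                    [0, 0, 1, 0]],
                   index=['taxA', 'taxB', 'taxC'],
                   columns=['s1', 's2', 's3', 's4'])
# taxC is seen in only one sample, so the prevalence filter (here > 2 samples) drops it
print(normalize_taxatable(toy, map_sample_ids=False, normalize_relative_abundance=False).index)  # ['taxA', 'taxB']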
In [12]:
# Normalize the full depth taxatable
fulldepth_df = normalize_taxatable(fulldepth_df, percent=.0001, normalize_relative_abundance=False)
# Replace zeros with a 0.65 pseudocount (multiplicative simple replacement) so the CLR below is defined
fulldepth_df[fulldepth_df == 0] = .65
fulldepth_df = fulldepth_df.apply(lambda x: x/x.sum(), axis=0)
In [13]:
fulldepth_df.head()
Out[13]:
In [14]:
if CLR:
    # clr operates on rows (samples), so transpose to Samples x Features first
    data = fulldepth_df.T.values
    data = clr(data)
    fulldepth_df = pd.DataFrame(data, index=fulldepth_df.columns, columns=fulldepth_df.index)
else:
    fulldepth_df = fulldepth_df.T
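For intuition, a hand-checked toy composition (not study data): the CLR log-ratios each part against the geometric mean of its sample, so every transformed row sums to zero.
x = np.array([[.1, .2, .7]])
print(clr(x))                       # ~[[-0.88, -0.19, 1.07]] = ln(x / geometric mean of x)
print(np.isclose(clr(x).sum(), 0))  # True: CLR rows always sum to zero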
In [15]:
fulldepth_df.shape
Out[15]:
In [16]:
depths = ["1", "01", "001", "0001"]  # subsampling levels: .1, .01, .001, .0001 of full depth
dice_coefficients = []
df_depths = {'fulldepth': fulldepth_df}
for depth in depths:
    depth_df = pd.read_csv("../results/karlsson2013/{}_1/taxatable.{}.species.normalized.txt".format(depth, TAXONOMIC_PROFILER), sep="\t", index_col=0)
    sample_ids = [int(_.split('.')[1]) for _ in depth_df.columns]
    hit_rate[depth] = depth_df.sum().sum()/depths_dict[depth]
    depth_df.columns = sample_ids
    # Align the subsampled table to the full-depth feature space (missing taxa stay zero)
    temp = pd.DataFrame(np.zeros(fulldepth_df.shape), index=fulldepth_df.index, columns=fulldepth_df.columns)
    for index in depth_df.index:
        if index in temp.columns:
            temp[str(index)] = depth_df.loc[str(index), :]
    depth_df = temp
    depth_df = normalize_taxatable(depth_df.T, prev=lambda df: 0, map_sample_ids=False, normalize_relative_abundance=False)
    # Dice dissimilarity of each sample's profile against the full feature set (all ones)
    for _, row in depth_df.T.iterrows():
        dice_coefficients.append([np.log10(float("." + str(depth))), dice(row, np.ones(len(row))), TAXONOMIC_PROFILER])
    depth_df[depth_df == 0] = .65
    depth_df = depth_df.apply(lambda x: x/x.sum(), axis=0)
    depth_df = depth_df.T
    if CLR:
        data = depth_df.values
        data = clr(data)
        depth_df = pd.DataFrame(data, index=depth_df.index, columns=depth_df.columns)
    df_depths[depth] = depth_df
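A hand-checked sketch (toy vectors, not study data) of scipy's `dice` on presence/absence vectors: comparing a sample's detected features against the full feature set (all ones) scores the share of full-depth features that go undetected.
u = np.array([True, True, True, False])  # 3 of 4 features detected
v = np.ones(4, dtype=bool)               # the full feature set
# Dice dissimilarity = (c_TF + c_FT) / (2*c_TT + c_FT + c_TF) = 1 / (2*3 + 1)
print(dice(u, v))                        # ~0.143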
In [17]:
df_dice = pd.DataFrame(dice_coefficients, columns=['depth', 'correlation', 'profiler'])
In [18]:
depth_df.head()
Out[18]:
In [19]:
hit_rate
# Box plot of hit rate across all depths
fig = plt.figure()
plt.boxplot(list(hit_rate.values()), labels=[""])
plt.ylabel("Fraction of Reads that Hit the Database")
plt.title("Hit Rate")
save_plot(fig, "hit_rate")
In [20]:
def multiclass_roc(X, y, classifier=CLASSIFIER(), classes=['IGT', 'NGT', 'T2D']):
    # y is already binarized (one indicator column per class)
    n_classes = len(classes)
    # Shuffle and split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)
    # Learn to predict each class against the rest
    y_score = classifier.fit(X_train, y_train).predict_proba(X_test)
    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    return fpr, tpr, roc_auc
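A hand-checked sketch (toy labels, not study data) of the micro-averaging above: `ravel()` flattens the binarized label and score matrices so every (sample, class) pair becomes one decision in a single binary ROC.
y_test_toy = np.array([[1, 0, 0],
                       [0, 0, 1]])
y_score_toy = np.array([[.8, .1, .1],
                        [.2, .3, .5]])
fpr_m, tpr_m, _ = roc_curve(y_test_toy.ravel(), y_score_toy.ravel())
print(auc(fpr_m, tpr_m))  # 1.0 here: every true class outscores every negative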
In [21]:
def plot_roc_auc_classifier(depth, fpr, tpr, roc_auc, ix=2, classes=['IGT', 'NGT', 'T2D']):
    fig = plt.figure()
    lw = 2
    plt.plot(fpr[ix], tpr[ix], color=COLORS[0], lw=lw,
             label='%s ROC curve (area = %0.2f)' % (classes[ix], roc_auc[ix]), alpha=.8)
    plt.plot([0, 1], [0, 1], color=COLORS[1], lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    if depth != 'fulldepth':
        plt.title('Receiver Operating Characteristic %s depth=.%s' % (classes[ix], str(depth)))
    else:
        plt.title('Receiver Operating Characteristic %s' % (classes[ix]))
    plt.legend(loc="lower right")
    save_plot(fig, 'roc_{}_{}'.format(depth, classes[ix]))

# Compute macro-average ROC curve and ROC area
def plot_multiroc_auc_classifier(depth, fpr, tpr, roc_auc, classes=['IGT', 'NGT', 'T2D']):
    lw = 2
    n_classes = len(classes)
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    # Finally average it and compute AUC
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    # Plot all ROC curves
    fig = plt.figure()
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
             color='deeppink', linestyle=':', linewidth=4)
    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4, alpha=.8)
    for i, color, target_name in zip(range(n_classes), COLORS, classes):
        plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                 label='ROC curve of class {0} (area = {1:0.2f})'.format(target_name, roc_auc[i]), alpha=.8)
    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    if depth != 'fulldepth':
        plt.title('Multiclass Receiver Operating Characteristic depth=.%s' % str(depth))
    else:
        plt.title('Multiclass Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    save_plot(fig, 'multiroc_{}'.format(depth))
In [22]:
for depth, df in df_depths.items():
    temp_y = join_df["Classification "].values
    temp_y = label_binarize(temp_y, classes=['IGT', 'NGT', 'T2D'])
    # mask = temp_y[:, 0] == 0
    # df = df[mask]
    # temp_y = temp_y[mask]
    # temp_y = temp_y[:, 1:]
    # Train/evaluate the multiclass classifier on a single 50/50 split at this depth
    fpr, tpr, roc_auc = multiclass_roc(df, temp_y, classes=['IGT', 'NGT', 'T2D'])
    plot_multiroc_auc_classifier(depth, fpr, tpr, roc_auc, classes=['IGT', 'NGT', 'T2D'])
    # for i in range(3):
    #     plot_roc_auc_classifier(depth, fpr, tpr, roc_auc, ix=i)
In [23]:
if RUN_TWO_CLASS_ROC_AUC:
    for depth, df in df_depths.items():
        temp_y = join_df["Classification "].values
        temp_y = label_binarize(temp_y, classes=['IGT', 'NGT', 'T2D'])
        # Drop IGT samples and keep only the NGT/T2D indicator columns
        mask = temp_y[:, 0] == 0
        df = df[mask]
        temp_y = temp_y[mask]
        temp_y = temp_y[:, 1:]
        fpr, tpr, roc_auc = multiclass_roc(df, temp_y, classes=['NGT', 'T2D'])
        plot_multiroc_auc_classifier(depth, fpr, tpr, roc_auc, classes=['NGT', 'T2D'])
        # for i in range(3):
        #     plot_roc_auc_classifier(depth, fpr, tpr, roc_auc, ix=i)
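A hand-checked sketch (toy labels, not study data) of the masking used above and in the next cell: `label_binarize` one-hot encodes the three classes, a zero in the first (IGT) column marks the NGT/T2D samples, and dropping that column leaves the two-class targets.
labels = np.array(['IGT', 'NGT', 'T2D', 'NGT'])
binarized = label_binarize(labels, classes=['IGT', 'NGT', 'T2D'])
# binarized == [[1,0,0], [0,1,0], [0,0,1], [0,1,0]]
mask = binarized[:, 0] == 0    # keeps the three non-IGT samples
print(binarized[mask][:, 1:])  # NGT/T2D indicator columns only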
In [40]:
result_cv_scores = []
for depth, df in df_depths.items():
    temp_y = join_df["Classification "].values
    temp_y = label_binarize(temp_y, classes=['IGT', 'NGT', 'T2D'])
    # Restrict to the two-class NGT vs. T2D problem
    mask = temp_y[:, 0] == 0
    df = df[mask]
    temp_y = temp_y[mask]
    temp_y = temp_y[:, 1:]
    fpr, tpr, roc_auc = multiclass_roc(df, temp_y, classes=['NGT', 'T2D'])
    # More than 5 folds glitches here, likely too few samples of each class per fold
    scores = cross_val_score(CLASSIFIER(), df, temp_y, cv=5, scoring="roc_auc")
    for score in scores:
        if depth == 'fulldepth':
            depth_name = 0
        else:
            depth_name = np.log10(float("." + str(depth)))
        result_cv_scores.append([depth_name, score, TAXONOMIC_PROFILER])
    for i in range(len(CLASSES)):
        plot_roc_auc_classifier(depth, fpr, tpr, roc_auc, ix=i, classes=CLASSES)
df_result_cv_scores = pd.DataFrame(result_cv_scores, columns=['depth', 'score', 'profiler'])
In [41]:
# df_result_cv_scores['mean'] = df_result_cv_scores.values[:,1:].mean(axis=1)
# df_result_cv_scores = df_result_cv_scores.sort_values('mean')
df_result_cv_scores.to_csv('../figures/karlsson2013_roc_cv_table.txt')
g = sns.factorplot(x="depth", y="score", kind="box", data=df_result_cv_scores, palette=COLORS, legend=False)
g.set_ylabels("ROC AUC").set_xlabels("Depth (log10)")
plt.title("Classification Accuracy by Depth")
pltname = "roc_auc"
save_plot(g, pltname)
In [26]:
def plot_pca(df_depths, y, colors=COLORS, classes=['NGT', 'T2D']):
    n_classes = len(classes)
    fulldepth_df = df_depths['fulldepth']
    # Fit PCA on the full-depth table, then project every depth onto the same axes
    pca = PCA(n_components=2)
    pca.fit(fulldepth_df)
    # Percentage of variance explained by each component
    print('explained variance ratio (first two components): %s'
          % str(pca.explained_variance_ratio_))
    for depth, df in df_depths.items():
        X_r = pca.transform(df)
        fig = plt.figure()
        lw = 2
        for color, target_name in zip(colors, classes):
            plt.scatter(X_r[y == target_name, 0], X_r[y == target_name, 1],
                        color=color, alpha=.8, lw=lw, label=target_name)
        plt.legend(loc='best', shadow=False, scatterpoints=1)
        if depth != 'fulldepth':
            plt.title('PCA of Karlsson dataset depth=.%s' % str(depth))
        else:
            plt.title('PCA of Karlsson dataset')
        save_plot(fig, 'pca_{}_nclasses{}'.format(depth, str(n_classes)))
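The design choice worth noting: PCA is fit once on the full-depth table and the subsampled tables are only projected onto those axes, so all panels share one coordinate system. A minimal sketch with hypothetical matrices:
rng = np.random.RandomState(0)
X_full = rng.normal(size=(20, 5))                    # stand-in for the full-depth table
X_sub = X_full + rng.normal(scale=.1, size=(20, 5))  # stand-in for a subsampled table
shared_pca = PCA(n_components=2).fit(X_full)
# Both projections share axes; refitting on X_sub would rotate them arbitrarily
proj_full, proj_sub = shared_pca.transform(X_full), shared_pca.transform(X_sub)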
In [27]:
df_depths2 = {}
for depth, df in df_depths.items():
    temp_y = join_df["Classification "].values
    # Drop IGT samples for the two-class PCA
    mask = temp_y != 'IGT'
    df = df[mask]
    temp_y = temp_y[mask]
    df_depths2[depth] = df
plot_pca(df_depths2, temp_y)
plot_pca(df_depths, join_df["Classification "].values)
plot_pca(df_depths, join_df["Classification "].values, classes=['IGT', 'NGT', 'T2D'])
In [28]:
def plot_lda(df_depths, y, colors=COLORS, classes=['NGT', 'T2D']):
    fulldepth_df = df_depths['fulldepth']
    n_classes = len(classes)
    # Fit LDA on the full-depth table, then project every depth onto the same axes
    lda = LinearDiscriminantAnalysis(n_components=2)
    lda.fit(fulldepth_df, y)
    lw = 2
    for depth, df in df_depths.items():
        X_r2 = lda.transform(df)
        fig = plt.figure()
        for color, target_name in zip(colors, classes):
            plt.scatter(X_r2[y == target_name, 0], X_r2[y == target_name, 1],
                        alpha=.8, color=color, lw=lw, label=target_name)
        plt.legend(loc='best', shadow=False, scatterpoints=1)
        if depth != 'fulldepth':
            plt.title('LDA of Karlsson dataset depth=.%s' % str(depth))
        else:
            plt.title('LDA of Karlsson dataset')
        save_plot(fig, 'lda_{}'.format(depth))
In [29]:
plot_lda(df_depths, join_df["Classification "].values, classes=['IGT', 'NGT', 'T2D'])
In [30]:
# Pearson Correlation
from scipy.stats import pearsonr
def pearson_correlations(df_depths):
    pearson_correlations = []
    fulldepth_df = df_depths['fulldepth']
    for depth, df in df_depths.items():
        if depth != 'fulldepth':
            # Per-sample Pearson correlation between subsampled and full-depth profiles
            for i, row in fulldepth_df.iterrows():
                pearson_correlations.append([np.log10(float("." + str(depth))), pearsonr(row, df.loc[i])[0], TAXONOMIC_PROFILER])
    # Full depth correlates perfectly with itself (anchor point at depth 0)
    pearson_correlations.append([0, 1.0, TAXONOMIC_PROFILER])
    return pd.DataFrame(pearson_correlations, columns=['depth', 'correlation', 'profiler'])
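A hand-checked toy (not study data) of the difference between this cell and the Spearman version below: Pearson rewards linear agreement, Spearman only rank agreement.
from scipy.stats import spearmanr
a = np.array([1.0, 2.0, 3.0, 4.0])
b = np.array([1.0, 2.0, 3.0, 10.0])  # monotone in a, but not linear
print(pearsonr(a, b)[0])             # ~0.89: the outlier breaks linearity
print(spearmanr(a, b)[0])            # 1.0: the ranks agree exactly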
In [31]:
pearson_df = pearson_correlations(df_depths)
In [32]:
g = sns.factorplot(x="depth", y="correlation", hue="profiler", data=pearson_df, palette=COLORS, capsize=.2, size=6, aspect=.75, alpha=.1, legend=False)
g.set_ylabels("Pearson Correlation").set_xlabels("Depth (log10)")
plt.title("Karlsson Pearson Correlation with Full Depth")
pltname = "pearson_correlation"
save_plot(g, pltname)
In [33]:
# Spearman Correlation
from scipy.stats import spearmanr
def spearman_correlations(df_depths):
    spearman_correlations = []
    fulldepth_df = df_depths['fulldepth']
    for depth, df in df_depths.items():
        if depth != 'fulldepth':
            # Per-sample Spearman correlation between subsampled and full-depth profiles
            for i, row in fulldepth_df.iterrows():
                spearman_correlations.append([np.log10(float("." + str(depth))), spearmanr(row, df.loc[i])[0], TAXONOMIC_PROFILER])
    # Full depth correlates perfectly with itself (anchor point at depth 0)
    spearman_correlations.append([0, 1.0, TAXONOMIC_PROFILER])
    return pd.DataFrame(spearman_correlations, columns=['depth', 'correlation', 'profiler'])
In [34]:
spearman_df = spearman_correlations(df_depths)
g = sns.factorplot(x="depth", y="correlation", hue="profiler", data=spearman_df, palette=COLORS, capsize=.2, size=6, aspect=.75, alpha=.1, legend=False)
g.set_ylabels("Spearman Correlation").set_xlabels("Depth (log10)")
plt.title("Spearman Correlation with Full Depth")
pltname = "spearman_correlation"
save_plot(g, pltname)
In [35]:
from skbio.stats.composition import ancom, clr_inv
results = []
for depth, df in df_depths.items():
    if CLR:
        # Undo the CLR so ANCOM sees compositions again
        data = clr_inv(df)
        df = pd.DataFrame(data, index=df.index, columns=df.columns)
    results.append([depth, ancom(df, join_df["Classification "])])
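Per scikit-bio's documented return value, `ancom` yields a pair: a per-feature results DataFrame (with a boolean 'Reject null hypothesis' column) and a table of percentile abundances. That is why the summary cell below indexes `result[1][0]`.
depth_label, (ancom_df, percentiles) = results[0]
print(ancom_df.columns)  # includes 'W' and 'Reject null hypothesis'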
In [36]:
g = sns.factorplot(x="depth", y="correlation", hue="profiler", data=df_dice, palette=COLORS, capsize=.2, size=6, aspect=.75, alpha=.1, legend=False)
g.set_ylabels("Dice Dissimilarity").set_xlabels("Depth (log10)")
plt.title("Dice Dissimilarity vs. Full Depth")
pltname = "dice_coefficients"
save_plot(g, pltname)
In [37]:
for result in results:
    # result[1][0] is the ANCOM results DataFrame; count the features called significant
    print(result[1][0]['Reject null hypothesis'].sum())