notebook.community

Edit and run



In [66]:

    
%matplotlib inline

import pandas as pd
from skbio.stats.composition import clr, closure
from matplotlib.colors import ListedColormap
from numpy.random import permutation
import numpy as np
from matplotlib.ticker import FormatStrFormatter
import os

from multiprocessing import Pool, cpu_count
import seaborn as sns
import matplotlib.pyplot as plt

def save_plot(fig, pltname, artists=(), size_inches=(4,4)):
    fig.set_size_inches(size_inches)
    fig.savefig(os.path.join("figures", "comparison_" + pltname + ".png"), dpi=350, bbox_extra_artists=artists, bbox_inches='tight', transparent=False)
    
def custom_legend(fig, colors, labels, legend_location="center left", legend_boundary = (1,.5), marker="o"):
    # Create custom legend for colors"
    patches = [ plt.plot([],[], marker=marker, ms=10, ls="", mec=None, color=colors[i], label="{:s}".format(labels[i]) )[0]  for i in range(len(labels)) ]
    artist = fig.legend(patches, labels, loc=legend_location, bbox_to_anchor=legend_boundary, fontsize=8)
    return artist

def multiplicative_replacement2(mat, delta=None):
    r"""Replace all zeros with small non-zero values
    It uses the multiplicative replacement strategy [1]_ ,
    replacing zeros with a small positive :math:`\delta`
    and ensuring that the compositions still add up to 1.
    Parameters
    ----------
    mat: array_like
       a matrix of proportions where
       rows = compositions and
       columns = components
    delta: float, optional
       a small number to be used to replace zeros
       If delta is not specified, then the default delta is
       :math:`\delta = \frac{1}{N^2}` where :math:`N`
       is the number of components
    Returns
    -------
    numpy.ndarray, np.float64
       A matrix of proportions where all of the values
       are nonzero and each composition (row) adds up to 1
    Raises
    ------
    ValueError
       Raises an error if negative proportions are created due to a large
       `delta`.
    Notes
    -----
    This method will result in negative proportions if a large delta is chosen.
    References
    ----------
    .. [1] J. A. Martin-Fernandez. "Dealing With Zeros and Missing Values in
           Compositional Data Sets Using Nonparametric Imputation"
    Examples
    --------
    >>> import numpy as np
    >>> from skbio.stats.composition import multiplicative_replacement
    >>> X = np.array([[.2,.4,.4, 0],[0,.5,.5,0]])
    >>> multiplicative_replacement(X)
    array([[ 0.1875,  0.375 ,  0.375 ,  0.0625],
           [ 0.0625,  0.4375,  0.4375,  0.0625]])
    """
    mat = np.array(mat)
    z_mat = (mat == 0)

    num_feats = mat.shape[-1]
    tot = np.sum(z_mat, axis=-1, keepdims=True)

    if delta is None:
        delta = np.min((np.min(mat[mat > 0])*.55, (1. / num_feats)**2))

    zcnts = 1 - ((tot * delta)/mat.sum(axis=-1, keepdims=True))
    if np.any(zcnts) < 0:
        raise ValueError('The multiplicative replacment created negative '
                         'proportions. Consider using a smaller `delta`.')
    mat = np.where(z_mat, delta, zcnts * mat)
    return mat.squeeze()



In [2]:

    
df_shotgun = pd.read_csv("data/taxatable.burst.shotgun.txt", sep="\t", index_col=0)
df_16s = pd.read_csv("data/taxatable.burst.16S.txt", sep="\t", index_col=0)



In [3]:

    
df_shotgun.shape









    Out[3]:





(5277, 180)



In [4]:

    
df_16s.shape









    Out[4]:





(728, 180)



In [5]:

    
# Preprocess columns for SRS IDs
df_shotgun.columns = map(lambda x: x.split(".")[0], df_shotgun.columns)



In [6]:

    
# Verify all SRS IDs exist in both DFs
print(len(set(df_shotgun.columns).intersection(df_16s.columns)) == df_16s.shape[1])









    



True



In [7]:

    
def stylize_axes(ax):
    """Customize axes spines, title, labels, ticks, and ticklabels."""
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    ax.xaxis.set_tick_params(top='off', direction='out', width=1)
    ax.yaxis.set_tick_params(right='off', direction='out', width=1)



In [8]:

    
def subsample(counts, n):
    """Subsamples new vector from vector of orig items.
    
    Returns all items if requested sample is larger than number of items.
    """
    choices = np.random.choice(np.arange(counts.shape[0]), n, replace=True, p=counts/counts.sum())
    return np.bincount(choices, minlength=counts.shape[0])



In [9]:

    
df_shotgun = df_shotgun.loc[:, df_shotgun.sum() > 100000]

df_shotgun = df_shotgun.apply(lambda x: subsample(x, 100000), axis=0)



In [10]:

    
# Get the none intersecting organisms
missing_strains = set(df_shotgun.index).intersection(df_16s.index).symmetric_difference(df_16s.index)



In [11]:

    
# Filter for a specific level
def filter_df_to_taxalevel(df, level):
    if level < 8:
        df['summary_taxa'] = [';'.join(str(_).split(';')[:level]) if str(_).count(';') >= level else False for _ in df.index]
        df = df[df['summary_taxa'] != False]
        return df.groupby('summary_taxa').sum()
    else:
        return df.loc[[';t__' in str(_) for _ in df.index], :]



In [12]:

    
# Filter for a specific level
def cleanup_taxatable(df):
    df['summary_taxa'] = [';'.join(str(_).split(';')[:7]) if str(_).count(';') >= 1 else False for _ in df.index]
    df = df[df['summary_taxa'] != False]
    return df.groupby('summary_taxa').sum()



In [13]:

    
def run_experiment(df_16s, df_shotgun, series_hmp_bodysite, level, only_bacteria=False):
    df_16s_species = filter_df_to_taxalevel(df_16s, level)
    df_shotgun_species = filter_df_to_taxalevel(df_shotgun, level)
    
    if only_bacteria:
        # Filter to only bacteria kingdom
        df_shotgun_species = df_shotgun_species.loc[['k__Bacteria;' in _ for _ in df_shotgun_species.index], :]
    
    df_16s_species = df_16s_species.T.join(series_hmp_bodysite, how="inner")
    
    # 16s summarize at level
    list_dfs = []
    for group, df in df_16s_species.groupby('bodysite'):
        bodysite = df["bodysite"]
        df = df.drop("bodysite", axis=1)
        df = df.loc[:, (df > 0).sum(axis=0)/df.shape[0] > .1]
        df = pd.DataFrame(clr(multiplicative_replacement2(df)), index=df.index, columns=df.columns)
        #df["bodysite"] = bodysite
        list_dfs.append(df.copy())
    df_clr_16s = pd.concat(list_dfs)
    df_clr_16s = df_clr_16s.dropna(axis=1, how="all")
    
    df_shotgun_species = df_shotgun_species.T.join(series_hmp_bodysite)
    
    # Shtogun summarize at level
    list_dfs = []
    for group, df in df_shotgun_species.groupby('bodysite'):
        bodysite = df["bodysite"]
        df = df.drop("bodysite", axis=1)
        df = df.loc[:, (df > 0).sum(axis=0)/df.shape[0] > .1]
        df = pd.DataFrame(clr(multiplicative_replacement2(df)), index=df.index, columns=df.columns)
        #df["bodysite"] = bodysite
        list_dfs.append(df.copy())
    df_clr_shotgun = pd.concat(list_dfs)
    df_clr_shotgun = df_clr_shotgun.dropna(axis=1, how="all")
    
    df_results = pd.DataFrame(yield_comparision_stats(df_clr_shotgun, df_clr_16s), columns=('srs', 'pearson', 'spearman', 'num_features_shotgun', 'num_features_16s', 'difference_shotgun', 'difference_16s', 'jaccard'))
    df_results = df_results.set_index('srs')
    df_results['bodytype'] = series_hmp_bodysite
    return df_results



In [14]:

    
def permutation_test(df_16s, df_shotgun, series_hmp_bodysite, level, num_perms=1, only_bacteria=False):
    df_16s_species = filter_df_to_taxalevel(df_16s, level)
    df_shotgun_species = filter_df_to_taxalevel(df_shotgun, level)
    
    if only_bacteria:
        # Filter to only bacteria kingdom
        df_shotgun_species = df_shotgun_species.loc[['k__Bacteria;' in _ for _ in df_shotgun_species.index], :]
    
    df_16s_species = df_16s_species.T.join(series_hmp_bodysite, how="inner")
    
    # 16s summarize at level
    list_dfs = []
    for group, df in df_16s_species.groupby('bodysite'):
        bodysite = df["bodysite"]
        df = df.drop("bodysite", axis=1)
        df = df.loc[:, (df > 0).sum(axis=0)/df.shape[0] > .1]
        df = pd.DataFrame(clr(multiplicative_replacement2(df)), index=df.index, columns=df.columns)
        #df["bodysite"] = bodysite
        list_dfs.append(df.copy())
    df_clr_16s = pd.concat(list_dfs)
    df_clr_16s = df_clr_16s.dropna(axis=1, how="all")
    
    df_shotgun_species = df_shotgun_species.T.join(series_hmp_bodysite)
    
    # Shtogun summarize at level
    list_dfs = []
    for group, df in df_shotgun_species.groupby('bodysite'):
        bodysite = df["bodysite"]
        df = df.drop("bodysite", axis=1)
        df = df.loc[:, (df > 0).sum(axis=0)/df.shape[0] > .1]
        df = pd.DataFrame(clr(multiplicative_replacement2(df)), index=df.index, columns=df.columns)
        #df["bodysite"] = bodysite
        list_dfs.append(df.copy())
    df_clr_shotgun = pd.concat(list_dfs)
    df_clr_shotgun = df_clr_shotgun.dropna(axis=1, how="all")
    
    s = 0
    for group, df in df_shotgun_species.groupby('bodysite'):
        for ind1, ind2 in zip(df.index, df.index):
            arr1 = df_clr_shotgun.loc[ind1, :].dropna()
            arr2 = df_clr_16s.loc[ind2, :].dropna()
            s += arr1.corr(arr2)*arr1.corr(arr2)
    yield s
    
    for i in range(num_perms-1):
        s = 0
        for group, df in df_shotgun_species.groupby('bodysite'):
            for ind1, ind2 in zip(df.index, permutation(list(df.index))):
                arr1 = df_clr_shotgun.loc[ind1, :].dropna()
                arr2 = df_clr_16s.loc[ind2, :].dropna()
                s += arr1.corr(arr2)*arr1.corr(arr2)
        yield s



In [15]:

    
df_hmp_map = pd.read_csv("data/HMP_map.txt", sep='\t', index_col="SRS")



In [16]:

    
def replace_multi(string, replacement, string_to_replace):
    for s in string_to_replace:
        string = string.replace(s, replacement)
    return string



In [17]:

    
# Remove duplicates
df_hmp_map['bodysite'] = list(map(lambda x: replace_multi(x, '', ['Left_', 'Right_']), df_hmp_map["HMPBODYSUBSITE"]))
series_hmp_bodysite = df_hmp_map["bodysite"]
series_hmp_bodysite = series_hmp_bodysite[~series_hmp_bodysite.index.duplicated()]



In [18]:

    
def weighted_jaccard(x, y):
    z = np.vstack((x,y))
    return np.sum(z.min(axis=0)/z.max(axis=0))



In [19]:

    
# Filter to df shotgun
df_shotgun_clean = cleanup_taxatable(df_shotgun)
df_16s_clean = cleanup_taxatable(df_16s)

# Join up mapping ids
df_shotgun_clean = df_shotgun_clean.T.join(series_hmp_bodysite, how="inner")
df_16s_clean = df_16s_clean.T.join(series_hmp_bodysite, how="inner")



In [20]:

    
def yield_comparision_stats(df_clr_shotgun, df_clr_16s):
    #('srs', 'pearson', 'spearman', 'num_features_shotgun', 'num_features_16s', 'difference_shotgun', 'difference_16s', 'jaccard', 'w_jaccard')
    for ind in df_clr_shotgun.index:
        arr1 = df_clr_shotgun.loc[ind, :].dropna()
        arr2 = df_clr_16s.loc[ind, :].dropna()
        set_arr1 = set(arr1.index)
        set_arr2 = set(arr2.index)
#         if len(set_arr2.difference(set_arr1)) > 0:
#             print(set_arr2.difference(set_arr1))
        #w_jaccard = weighted_jaccard(arr1.values, arr2[arr1].values)
        yield (ind, arr1.corr(arr2), arr1.corr(arr2, method='spearman'), len(set_arr1), len(set_arr2), len(set_arr1.difference(set_arr2)), len(set_arr2.difference(set_arr1)), len(set_arr1.intersection(set_arr2))/len(set_arr1.union(set_arr2)))



In [21]:

    
for l in range(2, 8):
    df_comparison = run_experiment(df_16s, df_shotgun, series_hmp_bodysite, l, only_bacteria=True)
    df_comparison.to_csv("results/levels.bacteria.%d.csv" % l)



In [22]:

    
for l in range(1, 8):
    df_comparison = run_experiment(df_16s, df_shotgun, series_hmp_bodysite, l, only_bacteria=False)
    df_comparison.to_csv("results/levels.%d.csv" % l)









    



/home/bhillmann/anaconda3/lib/python3.6/site-packages/numpy/lib/function_base.py:2995: RuntimeWarning: Degrees of freedom <= 0 for slice
  c = cov(x, y, rowvar)
/home/bhillmann/anaconda3/lib/python3.6/site-packages/numpy/lib/function_base.py:2929: RuntimeWarning: divide by zero encountered in double_scalars
  c *= 1. / np.float64(fact)



In [23]:

    
# Run 1,000 iterations of the permuation test
perm_results = np.array([i for i in permutation_test(df_16s, df_shotgun, series_hmp_bodysite, l, num_perms=10, only_bacteria=False)])



In [24]:

    
with plt.style.context(('seaborn-ticks', 'seaborn-paper')):
    fig, ax = plt.subplots()

sns.distplot(perm_results, ax=ax)
plt.axvline(x=perm_results[0], color='red')
ax.set_ylabel('frequency')
ax.set_xlabel('permuted sum of R-squared')
ax.xaxis.set_major_formatter(FormatStrFormatter('%.0f'))
ax.annotate("p = {:.3f}".format(1-(perm_results < perm_results[0]).sum()/perm_results.shape[0]), xy=(.6, .9), xycoords=ax.transAxes)

sns.despine(fig)

save_plot(fig, 'pairingmatters_hist')









    



/home/bhillmann/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "



In [25]:

    
# Filter to df shotgun
df_shotgun_bact = df_shotgun.loc[['k__Bacteria;' in str(_) for _ in df_shotgun.index], :]
df_shotgun_bact = cleanup_taxatable(df_shotgun)
df_16s_clean = cleanup_taxatable(df_16s)

# Join up mapping ids
df_shotgun_bact = df_shotgun_bact.T.join(series_hmp_bodysite, how="inner")
df_16s_clean = df_16s_clean.T.join(series_hmp_bodysite, how="inner")



In [26]:

    
from collections import defaultdict
shotgun_counter = {}
for group, df in df_shotgun_bact.groupby('bodysite'):
    group_counter = defaultdict(int)
    for c in df.columns:
        if c != 'bodysite':
            group_counter[c.count(';')] += df[c].sum()
    shotgun_counter[group] = group_counter



In [27]:

    
df_shotgun_levels = pd.DataFrame(shotgun_counter)
df_shotgun_levels = df_shotgun_levels/df_shotgun_levels.sum()



In [28]:

    
df_shotgun_levels









    Out[28]:







  
    
      
      Anterior_nares
      Buccal_mucosa
      Posterior_fornix
      Retroauricular_crease
      Stool
      Supragingival_plaque
      Tongue_dorsum
    
  
  
    
      1
      0.000751
      0.007118
      0.002868
      0.000215
      0.002998
      0.001614
      0.006157
    
    
      2
      0.001459
      0.003413
      0.000153
      0.001242
      0.000844
      0.002119
      0.004476
    
    
      3
      0.000831
      0.007302
      0.000796
      0.000429
      0.051975
      0.002121
      0.005674
    
    
      4
      0.004011
      0.006071
      0.000631
      0.002011
      0.009793
      0.004339
      0.006658
    
    
      5
      0.025945
      0.098925
      0.037694
      0.026196
      0.034289
      0.022963
      0.049930
    
    
      6
      0.967003
      0.877172
      0.957858
      0.969907
      0.900101
      0.966843
      0.927106



In [29]:

    
counter_16s = {}
for group, df in df_16s_clean.groupby('bodysite'):
    group_counter = defaultdict(int)
    for c in df.columns:
        if c != 'bodysite':
            group_counter[str(c.count(';'))] += df[c].sum()
    counter_16s[group] = group_counter



In [30]:

    
df_16s_levels = pd.DataFrame(counter_16s)
df_16s_levels = df_16s_levels/df_16s_levels.sum()



In [31]:

    
df_16s_levels









    Out[31]:







  
    
      
      Anterior_nares
      Buccal_mucosa
      Posterior_fornix
      Retroauricular_crease
      Stool
      Supragingival_plaque
      Tongue_dorsum
    
  
  
    
      1
      0.000027
      0.000000
      0.000000
      0.000000
      0.000034
      0.000007
      0.000000
    
    
      2
      0.000254
      0.001782
      0.000000
      0.000137
      0.000364
      0.000322
      0.022860
    
    
      3
      0.004707
      0.005229
      0.000000
      0.002302
      0.000661
      0.000816
      0.001975
    
    
      4
      0.000145
      0.001453
      0.000025
      0.000069
      0.001285
      0.002212
      0.000672
    
    
      5
      0.241745
      0.215958
      0.501423
      0.745960
      0.032385
      0.038234
      0.087670
    
    
      6
      0.753121
      0.775578
      0.498552
      0.251532
      0.965272
      0.958408
      0.886822



In [32]:

    
fig, ax = plt.subplots()
palette = sns.xkcd_palette(['royal blue', 'light red', 'ochre', 'black', 'grey', 'sky blue', 'dark turquoise', 'emerald', 'light purple', 'gold', 'night blue'])

df_16s_levels.index =  ["Phylum", "Class", "Order", "Family", "Genus", "Species"]

# Create stacked barplots
df_16s_levels.T.plot.barh(stacked=True, ax=ax, colormap=ListedColormap(palette), legend=True)

save_plot(fig, '16s_levels_annotated')



In [33]:

    
fig, ax = plt.subplots()
palette = sns.xkcd_palette(['royal blue', 'light red', 'ochre', 'black', 'grey', 'sky blue', 'dark turquoise', 'emerald', 'light purple', 'gold', 'night blue'])

df_shotgun_levels.index =  ["Phylum", "Class", "Order", "Family", "Genus", "Species"]

# Create stacked barplots
df_shotgun_levels.T.plot.barh(stacked=True, ax=ax, colormap=ListedColormap(palette), legend=False)

save_plot(fig, 'shotgun_levels_annotated')



In [34]:

    
filter_df_to_taxalevel(df_shotgun_bact.T, 5).sum(axis=1).sort_values(ascending=False).head()
#df_shotgun_species = df_shotgun_species.loc[['k__Bacteria;' in _ for _ in df_shotgun_species.index], :]









    Out[34]:





summary_taxa
k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae                    3890423
k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae                       1631089
k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae                    1557696
k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Propionibacteriales;f__Propionibacteriaceae    1235501
k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Micrococcales;f__Micrococcaceae                 590526
dtype: int64



In [35]:

    
x = 'k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae'
x = 'k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae'
'k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Propionibacteriales;f__Propionibacteriaceae'









    Out[35]:





'k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Propionibacteriales;f__Propionibacteriaceae'



In [36]:

    
set_shotgun = set(df_shotgun_bact.columns)
set_df_16s = set(df_16s_clean.columns)

# for col in set_shotgun.difference(set_df_16s):
#     df_16s_clean[col] = 0

for col in set_df_16s.difference(set_shotgun):
    df_shotgun_bact[col] = 0
    
df_shotgun_bact = df_shotgun_bact[list(set_df_16s)]



In [37]:

    
palette = sns.xkcd_palette(['royal blue', 'light red', 'ochre', 'black', 'grey', 'sky blue', 'dark turquoise', 'emerald', 'light purple', 'gold', 'night blue', 'green', 'maroon', 'orchid', 'deep pink', 'light grey blue', 'old rose', 'light pink', 'pale aqua', 'yellow', 'lime green', 'dark orange', 'brown'])
color_map = ListedColormap(palette)



In [38]:

    
fig, ax = plt.subplots()

df = df_shotgun_bact.T.loc[[x in _ for _ in df_shotgun_bact.T.index], :].T.head(25)
df += 1
df = (df.T/df.T.sum()).T
df.plot.barh(stacked=True, legend=False, colormap=color_map, ax=ax)

save_plot(fig, 'shotgun_prevotella')



In [39]:

    
fig, ax = plt.subplots()

df = df_16s_clean.T.loc[[x in _ for _ in df_16s_clean.T.index], :].T.head(25)
df += 1
df = (df.T/df.T.sum()).T
df.plot.barh(stacked=True, legend=False, colormap=color_map, ax=ax, figsize=(12,12))
#fig1 = custom_legend(fig, color_map, df.columns)
#save_plot(fig, '16s_prevotella', artists=(fig1,))
save_plot(fig, '16s_prevotella')



In [40]:

    
#df_x = df_16s_clean.T.loc[[x in _ for _ in df_16s_clean.T.index], :].T
df_x = df_16s_clean.T.loc[['Bacteria' in _ for _ in df_16s_clean.T.index], :].T
df_x = df_x.reindex(sorted(df_x.columns), axis=1)
df_x += 1
df_x = (df_x.T/df_x.T.sum()).T
#df_y = df_shotgun_bact.T.loc[[x in _ for _ in df_shotgun_bact.T.index], :].T
df_y = df_shotgun_bact.T.loc[['Bacteria' in _ for _ in df_shotgun_bact.T.index], :].T
df_y = df_y.reindex(sorted(df_y.columns), axis=1)
df_y += 1
df_y = (df_y.T/df_y.T.sum()).T


fig, ax = plt.subplots()

for i in df_x.index:
    if i in df_x.index and i in df_y.index:
        plt.plot(df_x.loc[i, :], df_y.loc[i, :], 'r+')



In [41]:

    
list_16s_dfs = []
list_shotgun_dfs = []
dict_missing_shotgun = defaultdict(list)
dict_missing_16s = defaultdict(list)
for (group_16, df_16), (group_shotgun, df_shotgun) in zip(df_16s_clean.groupby('bodysite'), df_shotgun_bact.groupby('bodysite')):
    # Run the prevalence filter
    df_16 = df_16.drop("bodysite", axis=1)
    df_16 = df_16.loc[:, (df_16 > 0).sum(axis=0)/df_16.shape[0] > .1]

    # Run the prevlance filter
    df_shotgun = df_shotgun.drop("bodysite", axis=1)
    df_shotgun = df_shotgun.loc[:, (df_shotgun > 0).sum(axis=0)/df_shotgun.shape[0] > .1]
    
    # Reindex both dataframes according to missing columns
    set_shotgun = set(df_shotgun.columns)
    set_df_16s = set(df_16.columns)

    for col in set_df_16s.difference(set_shotgun):
        df_shotgun[col] = 0
        for index in df_shotgun.index:
            dict_missing_shotgun[index].append(col)
    
    for col in set_shotgun.difference(set_df_16s):
        df_16[col] = 0
        for index in df_16.index:
            dict_missing_16s[index].append(col)

    #df_shotgun = df_shotgun[list(set_df_16s)]
    #df_16s = df_16s[list(set_shotgun)]
    
#    df_16 += .001
#    d
    mat_16 = multiplicative_replacement2(df_16, delta=.2)
    df_16 = pd.DataFrame(clr(mat_16), index=df_16.index, columns=df_16.columns)
    list_16s_dfs.append(df_16.copy())
#    df_shotgun += .001
    mat_shotgun = multiplicative_replacement2(df_shotgun, delta=.2)
#    df_shotgun = (df_shotgun.T/df_shotgun.T.sum()).T
    df_shotgun = pd.DataFrame(clr(mat_shotgun), index=df_shotgun.index, columns=df_shotgun.columns)
    list_shotgun_dfs.append(df_shotgun.copy())

df_clr_shotgun = pd.concat(list_shotgun_dfs)
df_clr_shotgun = df_clr_shotgun.dropna(axis=1, how="all")
df_clr_16s = pd.concat(list_16s_dfs)
df_clr_16s = df_clr_16s.dropna(axis=1, how="all")
df_clr_shotgun= df_clr_shotgun.reindex(sorted(df_clr_shotgun.columns), axis=1)
df_clr_16s= df_clr_16s.reindex(sorted(df_clr_16s.columns), axis=1)



In [42]:

    
df_colors = pd.DataFrame(np.zeros(df_clr_shotgun.shape), index=df_clr_shotgun.index, columns=df_clr_shotgun.columns)

for k, vs in dict_missing_shotgun.items():
    for v in vs:
        df_colors.loc[k, v] = 1
    
for k, vs in dict_missing_16s.items():
    for v in vs:
        df_colors.loc[k, v] = 2



In [43]:

    
with plt.style.context(('seaborn-ticks', 'seaborn-paper')):
    fig, ax = plt.subplots()
    
palette = sns.color_palette("colorblind", 3)

for i in df_clr_shotgun.index:
    if i in df_clr_shotgun.index and i in df_clr_16s.index:
        ax.scatter(df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 0].dropna(), df_clr_16s.loc[i, :][df_colors.loc[i, :] == 0].dropna(), color=palette[0], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax.scatter(df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 1].dropna(), df_clr_16s.loc[i, :][df_colors.loc[i, :] == 1].dropna(), color=palette[1], marker="o", alpha=.5, zorder=2, edgecolor='face', s=10)
        ax.scatter(df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 2].dropna(), df_clr_16s.loc[i, :][df_colors.loc[i, :] == 2].dropna(), color=palette[2], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)

ax.xaxis.set_ticks(np.arange(0, 1.01, .2))
ax.yaxis.set_ticks(np.arange(0, 1.01, .2))
ax.axis=['equal']

lims = [
    np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
    np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
]

# now plot both limits against eachother
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax.set_aspect('equal')
ax.set_xlim(lims)
ax.set_ylim(lims)

leg1 = custom_legend(ax, palette, ["Both", "16S Only", "Shotgun Only"])
ax.set_ylabel("16S Abundance")
ax.set_xlabel("Shotgun Abundance")

sns.despine(offset=10, trim=True)

save_plot(fig, "scatterplot_abundance", artists=(leg1,))



In [44]:

    
list_16s_dfs = []
list_shotgun_dfs = []
dict_missing_shotgun = defaultdict(list)
dict_missing_16s = defaultdict(list)
for (group_16, df_16), (group_shotgun, df_shotgun) in zip(df_16s_clean.groupby('bodysite'), df_shotgun_clean.groupby('bodysite')):
    # Run the prevalence filter
    df_16 = df_16.drop("bodysite", axis=1)
    df_16 = df_16.loc[:, (df_16 > 0).sum(axis=0)/df_16.shape[0] > .1]

    # Run the prevlance filter
    df_shotgun = df_shotgun.drop("bodysite", axis=1)
    df_shotgun = df_shotgun.loc[:, (df_shotgun > 0).sum(axis=0)/df_shotgun.shape[0] > .1]
    
    # Reindex both dataframes according to missing columns
    set_shotgun = set(df_shotgun.columns)
    set_df_16s = set(df_16.columns)

    for col in set_df_16s.difference(set_shotgun):
        df_shotgun[col] = 0
        for index in df_shotgun.index:
            dict_missing_shotgun[index].append(col)
    
    for col in set_shotgun.difference(set_df_16s):
        df_16[col] = 0
        for index in df_16.index:
            dict_missing_16s[index].append(col)

    #df_shotgun = df_shotgun[list(set_df_16s)]
    #df_16s = df_16s[list(set_shotgun)]
    
    df_16 += .001
    df_16 = (df_16.T/df_16.T.sum()).T
#     df_16 = pd.DataFrame(clr(multiplicative_replacement2(df_16, delta=.2)), index=df_16.index, columns=df_16.columns)
    list_16s_dfs.append(df_16.copy())
    df_shotgun += .001
    df_shotgun = (df_shotgun.T/df_shotgun.T.sum()).T
#     df_shotgun = pd.DataFrame(clr(multiplicative_replacement2(df_shotgun, delta=.2)), index=df_shotgun.index, columns=df_shotgun.columns)
    list_shotgun_dfs.append(df_shotgun.copy())

df_clr_shotgun = pd.concat(list_shotgun_dfs)
df_clr_shotgun = df_clr_shotgun.dropna(axis=1, how="all")
df_clr_16s = pd.concat(list_16s_dfs)
df_clr_16s = df_clr_16s.dropna(axis=1, how="all")
df_clr_shotgun= df_clr_shotgun.reindex(sorted(df_clr_shotgun.columns), axis=1)
df_clr_16s= df_clr_16s.reindex(sorted(df_clr_16s.columns), axis=1)



In [45]:

    
df_colors = pd.DataFrame(np.zeros(df_clr_shotgun.shape), index=df_clr_shotgun.index, columns=df_clr_shotgun.columns)

for k, vs in dict_missing_shotgun.items():
    for v in vs:
        df_colors.loc[k, v] = 1
    
for k, vs in dict_missing_16s.items():
    for v in vs:
        if 'k__Bacteria' in v:
            df_colors.loc[k, v] = 2    
        else:
            df_colors.loc[k, v] = 3



In [46]:

    
# Join up mapping ids
dfgroup_clr_shotgun = df_clr_shotgun.join(series_hmp_bodysite, how="inner").groupby('bodysite').mean()
dfgroup_clr_16s = df_clr_16s.join(series_hmp_bodysite, how="inner").groupby('bodysite').mean()



In [47]:

    
def yield_summaries(df, df_colors):
    #['shotgun_both','shotgun_shotgun','shotgun_16s_bacteria','shotgun_16s','16s_both','16s_shotgun','16s_16s_bacteria','16s_16s']
    for i in df_clr_shotgun.index:
        if i in df_clr_shotgun.index and i in df_clr_16s.index:
            yield [df.loc[i, :][df_colors.loc[i, :] == 0].dropna().sum(),
             df.loc[i, :][df_colors.loc[i, :] == 1].dropna().sum(),
             df.loc[i, :][df_colors.loc[i, :] == 2].dropna().sum(),
             df.loc[i, :][df_colors.loc[i, :] == 3].dropna().sum()]



In [48]:

    
df_prop_shotgun = pd.DataFrame(yield_summaries(df_clr_shotgun, df_colors), columns=['both','16s only','shotgun bacteria only','shotgun non-bacteria only'], index=df_clr_shotgun.index)
df_prop_16s = pd.DataFrame(yield_summaries(df_clr_16s, df_colors), columns=['both','16s only','shotgun bacteria only','shotgun non-bacteria only'], index=df_clr_shotgun.index)



In [49]:

    
df_grouped_shotgun = df_prop_shotgun.join(series_hmp_bodysite, how="inner").groupby('bodysite').mean()
df_grouped_16s = df_prop_16s.join(series_hmp_bodysite, how="inner").groupby('bodysite').mean()



In [51]:

    
fig, ax = plt.subplots()

palette = sns.xkcd_palette(["windows blue", "amber", "faded green", "dusty purple"])

for i in df_clr_shotgun.index:
    if i in df_clr_shotgun.index and i in df_clr_16s.index:
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 0].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 0].dropna(), color=palette[0], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 1].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 1].dropna(), color=palette[1], marker="o", alpha=.5, zorder=3, edgecolor='face', s=10)
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 2].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 2].dropna(), color=palette[2], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 3].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 3].dropna(), color=palette[3], marker="o", alpha=.5, zorder=2, edgecolor='face', s=10)

ax.xaxis.set_ticks(np.arange(0, 1.01, .2))
ax.yaxis.set_ticks(np.arange(0, 1.01, .2))
ax.axis=['equal']
        
lims = [
    np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
    np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
]

# now plot both limits against eachother
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax.set_aspect('equal')
ax.set_xlim(lims)
ax.set_ylim(lims)


# draw vertical line from (70,100) to (70, 250)
#ax.plot([0, .1], [.1, .1], 'k--', alpha=.75, zorder=4)

# draw diagonal line from (70, 90) to (90, 200)
#ax.plot([.1, .1], [0, .1], 'k--', alpha=.75, zorder=4)


leg1 = custom_legend(ax, palette, ["Both", "16S only", "Shallow shotgun only", "Shallow shotgun non-bacteria"])
ax.set_ylabel("Shallow shotgun abundance")
ax.set_xlabel("16S abundance")

sns.despine(offset=10, trim=True)

save_plot(fig, "scatterplot_abundance")



In [74]:

    
fig, ax = plt.subplots()

palette = sns.xkcd_palette(["windows blue", "amber", "faded green", "dusty purple"])

for i in df_clr_shotgun.index:
    if i in df_clr_shotgun.index and i in df_clr_16s.index:
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 0].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 0].dropna(), color=palette[0], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 1].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 1].dropna(), color=palette[1], marker="o", alpha=.5, zorder=3, edgecolor='face', s=10)
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 2].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 2].dropna(), color=palette[2], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 2].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 2].dropna(), color=palette[2], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        
ax.xaxis.set_ticks(np.arange(0, 1.01, .2))
ax.yaxis.set_ticks(np.arange(0, 1.01, .2))
ax.axis=['equal']
        
lims = [
    np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
    np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
]

# now plot both limits against eachother
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax.set_aspect('equal')
ax.set_xlim(lims)
ax.set_ylim(lims)


# draw vertical line from (70,100) to (70, 250)
#ax.plot([0, .1], [.1, .1], 'k--', alpha=.75, zorder=4)

# draw diagonal line from (70, 90) to (90, 200)
#ax.plot([.1, .1], [0, .1], 'k--', alpha=.75, zorder=4)


#leg1 = custom_legend(ax, palette, ["Both", "16S only", "Shallow shotgun only"])
ax.set_ylabel("Shallow shotgun abundance")
ax.set_xlabel("16S abundance")

sns.despine(offset=10, trim=True)

save_plot(fig, "scatterplot_abundance_no_purple", size_inches=(4,4))



In [58]:

    
fig, ax = plt.subplots()

palette = sns.xkcd_palette(["windows blue", "amber", "faded green", "dusty purple"])

for i in df_clr_shotgun.index:
    if i in df_clr_shotgun.index and i in df_clr_16s.index:
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 0].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 0].dropna(), color=palette[0], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 1].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 1].dropna(), color=palette[1], marker="o", alpha=.5, zorder=3, edgecolor='face', s=10)
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 2].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 2].dropna(), color=palette[2], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 3].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 3].dropna(), color=palette[3], marker="o", alpha=.5, zorder=2, edgecolor='face', s=10)

ax.xaxis.set_ticks(np.arange(0, .11, .02))
ax.yaxis.set_ticks(np.arange(0, .11, .02))
ax.set_ylim(-0.01, .11)
ax.set_xlim(-0.01, .11)
ax.axis=['equal']
        
lims = [
    np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
    np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
]

# now plot both limits against eachother
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax.set_aspect('equal')
ax.set_xlim(lims)
ax.set_ylim(lims)

# draw vertical line from (70,100) to (70, 250)
ax.plot([0, .1], [.1, .1], 'k--', alpha=.75, zorder=4)

# draw diagonal line from (70, 90) to (90, 200)
ax.plot([.1, .1], [0, .1], 'k--', alpha=.75, zorder=4)

#leg1 = custom_legend(ax, palette, ["Both", "16S only", "Shotgun only", "Shotgun non-bacteria"])
ax.set_ylabel("Shallow shotgun abundance")
ax.set_xlabel("16S abundance")

save_plot(fig, "inset_scatterplot_abundance")



In [59]:

    
fig, ax = plt.subplots()

color_map = ListedColormap(palette)
df_grouped_shotgun.index = [_.replace("_", " ") for _ in df_grouped_shotgun.index]

ax = df_grouped_shotgun.plot.barh(stacked=True, legend=False, ax=ax, colormap=color_map)
ax.set_xlabel("Mean abundance")
ax.yaxis.tick_right()

stylize_axes(ax)

save_plot(fig, "stackedhbar_abundances")









    



/home/bhillmann/anaconda3/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)



In [60]:

    
df_grouped_16s.plot.barh(stacked=True, legend=False)









    Out[60]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fbb5de51908>



In [61]:

    
with plt.style.context(('seaborn-ticks', 'seaborn-paper')):
    fig = plt.figure(figsize=(5.5,2))
    ax1 = plt.subplot(121)
    ax2 = plt.subplot(122)

df_grouped_shotgun.index = [_.replace("_", " ") for _ in df_grouped_shotgun.index]

ax1 = df_grouped_shotgun.plot.barh(stacked=True, legend=False, ax=ax1, colormap=color_map)
ax1.set_xlabel("Mean abundance")
ax1.set_title("Shallow shotgun")
#ax1.yaxis.tick_right()

stylize_axes(ax1)

df_grouped_16s.plot.barh(stacked=True, legend=False, colormap=color_map, ax=ax2)
stylize_axes(ax2)
ax2.set_ylabel("")
ax2.set_xlabel("Mean abundance")
ax2.set_title("16S")
ax2.set_yticklabels("")

for p in ax1.patches:
    if p.get_width() >= .08:
        ax1.annotate("%.2f" % p.get_width(), (p.get_x()+p.get_width()/2, p.get_y()), xytext=(0, 4), fontsize=10, textcoords='offset points', horizontalalignment='center')
        
for p in ax2.patches:
    if p.get_width() >= .08:
        ax2.annotate("%.2f" % p.get_width(), (p.get_x()+p.get_width()/2, p.get_y()), xytext=(0, 4), fontsize=10, textcoords='offset points', horizontalalignment='center')
    

leg1 = custom_legend(ax2, palette, ["Both", "16S only", "Shallow shotgun only", "Shallow shotgun non-bacteria"])

save_plot(fig, "stacked_hbar_both_abundance", size_inches=(11,4), artists=(leg1,))









    



/home/bhillmann/anaconda3/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)



In [71]:

    
color_map_no_purple = ListedColormap(palette[:3] + [palette[2]])

with plt.style.context(('seaborn-ticks', 'seaborn-paper')):
    fig = plt.figure(figsize=(4,4))
    ax1 = plt.subplot(111)
    
df_grouped_shotgun.index = [_.replace("_", " ") for _ in df_grouped_shotgun.index]

ax1 = df_grouped_shotgun.plot.barh(stacked=True, legend=False, ax=ax1, colormap=color_map)
ax1.set_xlabel("Mean abundance")
ax1.set_title("Shallow shotgun")
#ax1.yaxis.tick_right()

#stylize_axes(ax1)

for p in ax1.patches:
    if p.get_width() >= .08:
        ax1.annotate("%.2f" % p.get_width(), (p.get_x()+p.get_width()/2, p.get_y()), xytext=(0, 4), fontsize=8, textcoords='offset points', horizontalalignment='center')    

sns.despine(offset=10, trim=True)
        
save_plot(fig, "stacked_hbar_both_abundance_no_purple", size_inches=(4, 4))



In [ ]:

    
with plt.style.context(('seaborn-ticks', 'seaborn-paper')):
    fig = plt.figure(figsize=(5.5,2))
    ax1 = plt.subplot(121)
    ax2 = plt.subplot(122)

df_grouped_shotgun.index = [_.replace("_", " ") for _ in df_grouped_shotgun.index]

ax1 = df_grouped_shotgun.plot.barh(stacked=True, legend=False, ax=ax1, colormap=color_map)
ax1.set_xlabel("Mean abundance")
ax1.set_title("Shallow shotgun")
#ax1.yaxis.tick_right()

stylize_axes(ax1)

df_grouped_16s.plot.barh(stacked=True, legend=False, colormap=color_map, ax=ax2)
stylize_axes(ax2)
ax2.set_ylabel("")
ax2.set_xlabel("Mean abundance")
ax2.set_title("16S")
ax2.set_yticklabels("")

for p in ax1.patches:
    if p.get_width() >= .08:
        ax1.annotate("%.2f" % p.get_width(), (p.get_x()+p.get_width()/2, p.get_y()), xytext=(0, 4), fontsize=10, textcoords='offset points', horizontalalignment='center')
        
for p in ax2.patches:
    if p.get_width() >= .08:
        ax2.annotate("%.2f" % p.get_width(), (p.get_x()+p.get_width()/2, p.get_y()), xytext=(0, 4), fontsize=10, textcoords='offset points', horizontalalignment='center')
    

leg1 = custom_legend(ax2, palette, ["Both", "16S only", "Shallow shotgun only", "Shallow shotgun non-bacteria"])

save_plot(fig, "stacked_hbar_both_abundance", size_inches=(11,4), artists=(leg1,))



In [55]:

    
fig, ax = plt.subplots()

color_map = ListedColormap(palette)
df_grouped_shotgun.index = [_.replace("_", " ") for _ in df_grouped_shotgun.index]

ax = df_grouped_shotgun.plot.barh(stacked=True, legend=False, ax=ax, colormap=color_map)
ax.set_xlabel("Mean abundance")
ax.yaxis.tick_right()

stylize_axes(ax)

save_plot(fig, "stackedhbar_abundances")









    



/home/bhillmann/anaconda3/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)



In [56]:

    
df_grouped_shotgun['datatype'] = 'shotgun'
df_grouped_16s['datatype'] = '16s'

df_grouped_prop = pd.concat((df_grouped_shotgun, df_grouped_16s))



In [57]:

    
df_grouped_prop









    Out[57]:







  
    
      
      both
      16s only
      shotgun bacteria only
      shotgun non-bacteria only
      datatype
    
  
  
    
      Anterior nares
      0.468352
      1.812641e-07
      0.520749
      0.010900
      shotgun
    
    
      Buccal mucosa
      0.870817
      4.024719e-08
      0.126397
      0.002786
      shotgun
    
    
      Posterior fornix
      0.381709
      4.007048e-08
      0.618235
      0.000056
      shotgun
    
    
      Retroauricular crease
      0.200967
      7.211483e-08
      0.795375
      0.003658
      shotgun
    
    
      Stool
      0.890525
      9.022622e-08
      0.109244
      0.000231
      shotgun
    
    
      Supragingival plaque
      0.912821
      2.002341e-08
      0.087128
      0.000051
      shotgun
    
    
      Tongue dorsum
      0.857125
      2.004793e-08
      0.142757
      0.000118
      shotgun
    
    
      Anterior_nares
      0.998972
      8.296572e-04
      0.000170
      0.000028
      16s
    
    
      Buccal_mucosa
      0.999774
      1.056980e-04
      0.000114
      0.000007
      16s
    
    
      Posterior_fornix
      0.999850
      4.410212e-05
      0.000103
      0.000003
      16s
    
    
      Retroauricular_crease
      0.999576
      3.415784e-04
      0.000069
      0.000013
      16s
    
    
      Stool
      0.998425
      1.480189e-03
      0.000094
      0.000001
      16s
    
    
      Supragingival_plaque
      0.999733
      1.954522e-04
      0.000070
      0.000002
      16s
    
    
      Tongue_dorsum
      0.999749
      1.704212e-04
      0.000078
      0.000002
      16s



In [58]:

    
df_grouped_prop.plot.barh(stacked=True, legend=True)









    Out[58]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f69f31f3710>



In [59]:

    
grid = sns.FacetGrid(df_grouped_prop, row="datatype", margin_titles=True)
def stacked_bar(**kwargs):
    print(kwargs)
    return df.plot.barh(stacked=True, legend=True)
grid.map(stacked_bar)









    



{'color': (0.12156862745098039, 0.4666666666666667, 0.7058823529411765)}
{'color': (0.12156862745098039, 0.4666666666666667, 0.7058823529411765)}






    Out[59]:





<seaborn.axisgrid.FacetGrid at 0x7f69f30d8e10>



In [60]:

    
fig = plt.figure(figsize=(12,4))
ax1 = plt.subplot(221)
ax2 = plt.subplot(222)
ax3 = plt.subplot(223)
ax4 = plt.subplot(234)

#ax1
sns.distplot(perm_results, ax=ax1)
ax1.axvline(x=perm_results[0], color='red')
ax1.set_ylabel('frequency')
ax1.set_xlabel('permuted sum of R-squared')
ax1.xaxis.set_major_formatter(FormatStrFormatter('%.0f'))
ax1.annotate("p = {:.3f}".format(1-(perm_results < perm_results[0]).sum()/perm_results.shape[0]), xy=(.6, .9), xycoords=ax1.transAxes)

#ax2
color_map = ListedColormap(palette)
df_grouped_shotgun.index = [_.replace("_", " ") for _ in df_grouped_shotgun.index]

ax2 = df_grouped_shotgun.plot.barh(stacked=True, legend=False, ax=ax2, colormap=color_map)
ax2.set_xlabel("Mean abundance")

#ax3

#ax4









    



/home/bhillmann/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "






    Out[60]:





Text(0.5,0,'Mean abundance')



In [61]:

    
with plt.style.context(('seaborn-ticks', 'seaborn-paper')):
    fig = plt.figure(figsize=(11, 8))
    ax1 = plt.subplot(121)
    ax2 = plt.subplot(122)

palette = sns.xkcd_palette(["windows blue", "amber", "faded green", "dusty purple"])

for i in df_clr_shotgun.index:
    if i in df_clr_shotgun.index and i in df_clr_16s.index:
        ax2.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 0].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 0].dropna(), color=palette[0], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax2.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 1].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 1].dropna(), color=palette[1], marker="o", alpha=.5, zorder=3, edgecolor='face', s=10)
        ax2.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 2].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 2].dropna(), color=palette[2], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax2.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 3].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 3].dropna(), color=palette[3], marker="o", alpha=.5, zorder=2, edgecolor='face', s=10)

ax2.xaxis.set_ticks(np.arange(0, .11, .02))
ax2.yaxis.set_ticks(np.arange(0, .11, .02))
ax2.set_ylim(-0.01, .11)
ax2.set_xlim(-0.01, .11)
ax2.axis=['equal']
        
lims = [
    np.min([ax2.get_xlim(), ax2.get_ylim()]),  # min of both axes
    np.max([ax2.get_xlim(), ax2.get_ylim()]),  # max of both axes
]

# now plot both limits against eachother
ax2.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax2.set_aspect('equal')
ax2.set_xlim(lims)
ax2.set_ylim(lims)

# draw vertical line from (70,100) to (70, 250)
ax2.plot([0, .1], [.1, .1], 'k--', alpha=.75, zorder=4)

# draw diagonal line from (70, 90) to (90, 200)
ax2.plot([.1, .1], [0, .1], 'k--', alpha=.75, zorder=4)

#leg1 = custom_legend(ax, palette, ["Both", "16S only", "Shotgun only", "Shotgun non-bacteria"])
ax1.set_ylabel("Shallow shotgun abundance")
ax1.set_xlabel("16S abundance")

for i in df_clr_shotgun.index:
    if i in df_clr_shotgun.index and i in df_clr_16s.index:
        ax1.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 0].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 0].dropna(), color=palette[0], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax1.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 1].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 1].dropna(), color=palette[1], marker="o", alpha=.5, zorder=3, edgecolor='face', s=10)
        ax1.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 2].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 2].dropna(), color=palette[2], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax1.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 3].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 3].dropna(), color=palette[3], marker="o", alpha=.5, zorder=2, edgecolor='face', s=10)

ax1.xaxis.set_ticks(np.arange(0, 1.01, .2))
ax1.yaxis.set_ticks(np.arange(0, 1.01, .2))
ax1.axis=['equal']
        
lims = [
    np.min([ax1.get_xlim(), ax1.get_ylim()]),  # min of both axes
    np.max([ax1.get_xlim(), ax1.get_ylim()]),  # max of both axes
]

# now plot both limits against eachother
ax1.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax1.set_aspect('equal')
ax1.set_xlim(lims)
ax1.set_ylim(lims)

# draw vertical line from (70,100) to (70, 250)
ax1.plot([0, .1], [.1, .1], 'k--', alpha=.75, zorder=4)

# draw diagonal line from (70, 90) to (90, 200)
ax1.plot([.1, .1], [0, .1], 'k--', alpha=.75, zorder=4)

from scipy.stats import pearsonr

x = df_clr_16s.loc[df_clr_shotgun.index, :].values.flatten()
x = x[~np.isnan(x)]

y = df_clr_shotgun.values.flatten()
y = y[~np.isnan(y)]

r, p = pearsonr(x, y)
r = r*r

ax1.annotate(r"$r^2 = {:.2f}$ $p = {:.4f}$".format(r,p), xy=(.1, .9), xycoords=ax1.transAxes)
#ax1.annotate(r"$p = {:.4f}$".format(p), xy=(.4, .9), xycoords=ax1.transAxes)
stylize_axes(ax1)
stylize_axes(ax2)

leg1 = custom_legend(ax2, palette, ["Both", "16S only", "Shallow shotgun only", "Shallow shotgun non-bacteria"])
#ax2.set_ylabel("Shallow shotgun abundance")
ax2.set_xlabel("16S abundance")
plt.tight_layout()

sns.despine(offset=10, trim=True)

#fig.text(.5, -0.1, "16S Abundance", ha='center', va='center')
save_plot(fig, "scatterplot_both_abundance", size_inches=(11,8), artists=(leg1,))









    



/home/bhillmann/anaconda3/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)



In [62]:

    
from scipy.stats import pearsonr

x = df_clr_16s.loc[df_clr_shotgun.index, :].values.flatten()
x = x[~np.isnan(x)]

y = df_clr_shotgun.values.flatten()
y = y[~np.isnan(y)]

r, p = pearsonr(x, y)
r = r*r



In [63]:

    
from scipy.stats import linregress
slope, intercept, r_value, p_value, std_err = linregress(x, y)



In [64]:

    
slope, intercept, r_value, p_value









    Out[64]:





(0.49618829112230212, 0.00092021415951870243, 0.53704785635882046, 0.0)



In [65]:

    
fig = plt.figure(figsize=(12,8))
ax1 = plt.subplot(221)
ax2 = plt.subplot(222)
ax3 = plt.subplot(223)
ax4 = plt.subplot(224)

palette = sns.xkcd_palette(["windows blue", "amber", "faded green", "dusty purple"])

df_grouped_shotgun.index = [_.replace("_", " ") for _ in df_grouped_shotgun.index]

ax1 = df_grouped_shotgun.plot.barh(stacked=True, legend=False, ax=ax1, colormap=color_map)
ax1.set_xlabel("Mean abundance")
ax1.set_title("Shallow shotgun")
#ax1.yaxis.tick_right()

stylize_axes(ax1)

df_grouped_16s.plot.barh(stacked=True, legend=False, colormap=color_map, ax=ax2)
stylize_axes(ax2)
ax2.set_ylabel("")
ax2.set_xlabel("Mean abundance")
ax2.set_title("16S")

plt.setp(ax2.get_yticklabels(), visible=False)


for i in df_clr_shotgun.index:
    if i in df_clr_shotgun.index and i in df_clr_16s.index:
        ax3.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 0].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 0].dropna(), color=palette[0], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax3.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 1].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 1].dropna(), color=palette[1], marker="o", alpha=.5, zorder=3, edgecolor='face', s=10)
        ax3.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 2].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 2].dropna(), color=palette[2], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax3.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 3].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 3].dropna(), color=palette[3], marker="o", alpha=.5, zorder=2, edgecolor='face', s=10)

ax3.xaxis.set_ticks(np.arange(0, .11, .02))
ax3.yaxis.set_ticks(np.arange(0, .11, .02))
ax3.set_ylim(-0.01, .11)
ax3.set_xlim(-0.01, .11)
ax3.axis=['equal']
        
lims = [
    np.min([ax3.get_xlim(), ax3.get_ylim()]),  # min of both axes
    np.max([ax3.get_xlim(), ax3.get_ylim()]),  # max of both axes
]

# now plot both limits against eachother
ax3.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax3.set_xlim(lims)
ax3.set_ylim(lims)

# draw vertical line from (70,100) to (70, 250)
ax3.plot([0, .1], [.1, .1], 'k--', alpha=.75, zorder=4)

# draw diagonal line from (70, 90) to (90, 200)
ax3.plot([.1, .1], [0, .1], 'k--', alpha=.75, zorder=4)

#leg1 = custom_legend(ax, palette, ["Both", "16S only", "Shotgun only", "Shotgun non-bacteria"])
ax3.set_ylabel("Shallow shotgun abundance")
ax3.set_xlabel("16S abundance")

for i in df_clr_shotgun.index:
    if i in df_clr_shotgun.index and i in df_clr_16s.index:
        ax4.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 0].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 0].dropna(), color=palette[0], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax4.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 1].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 1].dropna(), color=palette[1], marker="o", alpha=.5, zorder=3, edgecolor='face', s=10)
        ax4.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 2].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 2].dropna(), color=palette[2], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax4.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 3].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 3].dropna(), color=palette[3], marker="o", alpha=.5, zorder=2, edgecolor='face', s=10)

ax4.xaxis.set_ticks(np.arange(0, 1.01, .2))
ax4.yaxis.set_ticks(np.arange(0, 1.01, .2))
        
lims = [
    np.min([ax4.get_xlim(), ax4.get_ylim()]),  # min of both axes
    np.max([ax4.get_xlim(), ax4.get_ylim()]),  # max of both axes
]

# now plot both limits against eachother
ax4.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax4.set_aspect('equal')
ax4.set_xlim(lims)
ax4.set_ylim(lims)

# draw vertical line from (70,100) to (70, 250)
ax4.plot([0, .1], [.1, .1], 'k--', alpha=.75, zorder=4)

# draw diagonal line from (70, 90) to (90, 200)
ax4.plot([.1, .1], [0, .1], 'k--', alpha=.75, zorder=4)

stylize_axes(ax3)
stylize_axes(ax4)

leg1 = custom_legend(ax2, palette, ["Both", "16S only", "Shallow shotgun only", "Shallow shotgun non-bacteria"])
#ax2.set_ylabel("Shallow shotgun abundance")
ax4.set_xlabel("16S abundance")
#ax1.set_aspect('equal')
#ax2.set_aspect('equal')
ax3.set_aspect('equal')
ax4.set_aspect('equal')

plt.tight_layout()

#fig.text(.5, -0.1, "16S Abundance", ha='center', va='center')
save_plot(fig, "scatterplot_barplot_both_abundance", size_inches=(12,8), artists=(leg1,))









    



/home/bhillmann/anaconda3/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)



In [66]:

    
print(plt.style.available)









    



['Solarize_Light2', '_classic_test', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn-bright', 'seaborn-colorblind', 'seaborn-dark-palette', 'seaborn-dark', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'seaborn', 'tableau-colorblind10']



In [ ]:

	Anterior_nares	Buccal_mucosa	Posterior_fornix	Retroauricular_crease	Stool	Supragingival_plaque	Tongue_dorsum
1	0.000751	0.007118	0.002868	0.000215	0.002998	0.001614	0.006157
2	0.001459	0.003413	0.000153	0.001242	0.000844	0.002119	0.004476
3	0.000831	0.007302	0.000796	0.000429	0.051975	0.002121	0.005674
4	0.004011	0.006071	0.000631	0.002011	0.009793	0.004339	0.006658
5	0.025945	0.098925	0.037694	0.026196	0.034289	0.022963	0.049930
6	0.967003	0.877172	0.957858	0.969907	0.900101	0.966843	0.927106

	Anterior_nares	Buccal_mucosa	Posterior_fornix	Retroauricular_crease	Stool	Supragingival_plaque	Tongue_dorsum
1	0.000027	0.000000	0.000000	0.000000	0.000034	0.000007	0.000000
2	0.000254	0.001782	0.000000	0.000137	0.000364	0.000322	0.022860
3	0.004707	0.005229	0.000000	0.002302	0.000661	0.000816	0.001975
4	0.000145	0.001453	0.000025	0.000069	0.001285	0.002212	0.000672
5	0.241745	0.215958	0.501423	0.745960	0.032385	0.038234	0.087670
6	0.753121	0.775578	0.498552	0.251532	0.965272	0.958408	0.886822

	both	16s only	shotgun bacteria only	shotgun non-bacteria only	datatype
Anterior nares	0.468352	1.812641e-07	0.520749	0.010900	shotgun
Buccal mucosa	0.870817	4.024719e-08	0.126397	0.002786	shotgun
Posterior fornix	0.381709	4.007048e-08	0.618235	0.000056	shotgun
Retroauricular crease	0.200967	7.211483e-08	0.795375	0.003658	shotgun
Stool	0.890525	9.022622e-08	0.109244	0.000231	shotgun
Supragingival plaque	0.912821	2.002341e-08	0.087128	0.000051	shotgun
Tongue dorsum	0.857125	2.004793e-08	0.142757	0.000118	shotgun
Anterior_nares	0.998972	8.296572e-04	0.000170	0.000028	16s
Buccal_mucosa	0.999774	1.056980e-04	0.000114	0.000007	16s
Posterior_fornix	0.999850	4.410212e-05	0.000103	0.000003	16s
Retroauricular_crease	0.999576	3.415784e-04	0.000069	0.000013	16s
Stool	0.998425	1.480189e-03	0.000094	0.000001	16s
Supragingival_plaque	0.999733	1.954522e-04	0.000070	0.000002	16s
Tongue_dorsum	0.999749	1.704212e-04	0.000078	0.000002	16s