In [66]:
%matplotlib inline

import pandas as pd
from skbio.stats.composition import clr, closure
from matplotlib.colors import ListedColormap
from numpy.random import permutation
import numpy as np
from matplotlib.ticker import FormatStrFormatter
import os

from multiprocessing import Pool, cpu_count
import seaborn as sns
import matplotlib.pyplot as plt

def save_plot(fig, pltname, artists=(), size_inches=(4,4)):
    fig.set_size_inches(size_inches)
    fig.savefig(os.path.join("figures", "comparison_" + pltname + ".png"), dpi=350, bbox_extra_artists=artists, bbox_inches='tight', transparent=False)
    
def custom_legend(fig, colors, labels, legend_location="center left", legend_boundary = (1,.5), marker="o"):
    # Create custom legend for colors"
    patches = [ plt.plot([],[], marker=marker, ms=10, ls="", mec=None, color=colors[i], label="{:s}".format(labels[i]) )[0]  for i in range(len(labels)) ]
    artist = fig.legend(patches, labels, loc=legend_location, bbox_to_anchor=legend_boundary, fontsize=8)
    return artist

def multiplicative_replacement2(mat, delta=None):
    r"""Replace all zeros with small non-zero values
    It uses the multiplicative replacement strategy [1]_ ,
    replacing zeros with a small positive :math:`\delta`
    and ensuring that the compositions still add up to 1.
    Parameters
    ----------
    mat: array_like
       a matrix of proportions where
       rows = compositions and
       columns = components
    delta: float, optional
       a small number to be used to replace zeros
       If delta is not specified, then the default delta is
       :math:`\delta = \frac{1}{N^2}` where :math:`N`
       is the number of components
    Returns
    -------
    numpy.ndarray, np.float64
       A matrix of proportions where all of the values
       are nonzero and each composition (row) adds up to 1
    Raises
    ------
    ValueError
       Raises an error if negative proportions are created due to a large
       `delta`.
    Notes
    -----
    This method will result in negative proportions if a large delta is chosen.
    References
    ----------
    .. [1] J. A. Martin-Fernandez. "Dealing With Zeros and Missing Values in
           Compositional Data Sets Using Nonparametric Imputation"
    Examples
    --------
    >>> import numpy as np
    >>> from skbio.stats.composition import multiplicative_replacement
    >>> X = np.array([[.2,.4,.4, 0],[0,.5,.5,0]])
    >>> multiplicative_replacement(X)
    array([[ 0.1875,  0.375 ,  0.375 ,  0.0625],
           [ 0.0625,  0.4375,  0.4375,  0.0625]])
    """
    mat = np.array(mat)
    z_mat = (mat == 0)

    num_feats = mat.shape[-1]
    tot = np.sum(z_mat, axis=-1, keepdims=True)

    if delta is None:
        delta = np.min((np.min(mat[mat > 0])*.55, (1. / num_feats)**2))

    zcnts = 1 - ((tot * delta)/mat.sum(axis=-1, keepdims=True))
    if np.any(zcnts) < 0:
        raise ValueError('The multiplicative replacment created negative '
                         'proportions. Consider using a smaller `delta`.')
    mat = np.where(z_mat, delta, zcnts * mat)
    return mat.squeeze()

In [2]:
df_shotgun = pd.read_csv("data/taxatable.burst.shotgun.txt", sep="\t", index_col=0)
df_16s = pd.read_csv("data/taxatable.burst.16S.txt", sep="\t", index_col=0)

In [3]:
df_shotgun.shape


Out[3]:
(5277, 180)

In [4]:
df_16s.shape


Out[4]:
(728, 180)

In [5]:
# Preprocess columns for SRS IDs
df_shotgun.columns = map(lambda x: x.split(".")[0], df_shotgun.columns)

In [6]:
# Verify all SRS IDs exist in both DFs
print(len(set(df_shotgun.columns).intersection(df_16s.columns)) == df_16s.shape[1])


True

In [7]:
def stylize_axes(ax):
    """Customize axes spines, title, labels, ticks, and ticklabels."""
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    ax.xaxis.set_tick_params(top='off', direction='out', width=1)
    ax.yaxis.set_tick_params(right='off', direction='out', width=1)

In [8]:
def subsample(counts, n):
    """Subsamples new vector from vector of orig items.
    
    Returns all items if requested sample is larger than number of items.
    """
    choices = np.random.choice(np.arange(counts.shape[0]), n, replace=True, p=counts/counts.sum())
    return np.bincount(choices, minlength=counts.shape[0])

In [9]:
df_shotgun = df_shotgun.loc[:, df_shotgun.sum() > 100000]

df_shotgun = df_shotgun.apply(lambda x: subsample(x, 100000), axis=0)

In [10]:
# Get the none intersecting organisms
missing_strains = set(df_shotgun.index).intersection(df_16s.index).symmetric_difference(df_16s.index)

In [11]:
# Filter for a specific level
def filter_df_to_taxalevel(df, level):
    if level < 8:
        df['summary_taxa'] = [';'.join(str(_).split(';')[:level]) if str(_).count(';') >= level else False for _ in df.index]
        df = df[df['summary_taxa'] != False]
        return df.groupby('summary_taxa').sum()
    else:
        return df.loc[[';t__' in str(_) for _ in df.index], :]

In [12]:
# Filter for a specific level
def cleanup_taxatable(df):
    df['summary_taxa'] = [';'.join(str(_).split(';')[:7]) if str(_).count(';') >= 1 else False for _ in df.index]
    df = df[df['summary_taxa'] != False]
    return df.groupby('summary_taxa').sum()

In [13]:
def run_experiment(df_16s, df_shotgun, series_hmp_bodysite, level, only_bacteria=False):
    df_16s_species = filter_df_to_taxalevel(df_16s, level)
    df_shotgun_species = filter_df_to_taxalevel(df_shotgun, level)
    
    if only_bacteria:
        # Filter to only bacteria kingdom
        df_shotgun_species = df_shotgun_species.loc[['k__Bacteria;' in _ for _ in df_shotgun_species.index], :]
    
    df_16s_species = df_16s_species.T.join(series_hmp_bodysite, how="inner")
    
    # 16s summarize at level
    list_dfs = []
    for group, df in df_16s_species.groupby('bodysite'):
        bodysite = df["bodysite"]
        df = df.drop("bodysite", axis=1)
        df = df.loc[:, (df > 0).sum(axis=0)/df.shape[0] > .1]
        df = pd.DataFrame(clr(multiplicative_replacement2(df)), index=df.index, columns=df.columns)
        #df["bodysite"] = bodysite
        list_dfs.append(df.copy())
    df_clr_16s = pd.concat(list_dfs)
    df_clr_16s = df_clr_16s.dropna(axis=1, how="all")
    
    df_shotgun_species = df_shotgun_species.T.join(series_hmp_bodysite)
    
    # Shtogun summarize at level
    list_dfs = []
    for group, df in df_shotgun_species.groupby('bodysite'):
        bodysite = df["bodysite"]
        df = df.drop("bodysite", axis=1)
        df = df.loc[:, (df > 0).sum(axis=0)/df.shape[0] > .1]
        df = pd.DataFrame(clr(multiplicative_replacement2(df)), index=df.index, columns=df.columns)
        #df["bodysite"] = bodysite
        list_dfs.append(df.copy())
    df_clr_shotgun = pd.concat(list_dfs)
    df_clr_shotgun = df_clr_shotgun.dropna(axis=1, how="all")
    
    df_results = pd.DataFrame(yield_comparision_stats(df_clr_shotgun, df_clr_16s), columns=('srs', 'pearson', 'spearman', 'num_features_shotgun', 'num_features_16s', 'difference_shotgun', 'difference_16s', 'jaccard'))
    df_results = df_results.set_index('srs')
    df_results['bodytype'] = series_hmp_bodysite
    return df_results

In [14]:
def permutation_test(df_16s, df_shotgun, series_hmp_bodysite, level, num_perms=1, only_bacteria=False):
    df_16s_species = filter_df_to_taxalevel(df_16s, level)
    df_shotgun_species = filter_df_to_taxalevel(df_shotgun, level)
    
    if only_bacteria:
        # Filter to only bacteria kingdom
        df_shotgun_species = df_shotgun_species.loc[['k__Bacteria;' in _ for _ in df_shotgun_species.index], :]
    
    df_16s_species = df_16s_species.T.join(series_hmp_bodysite, how="inner")
    
    # 16s summarize at level
    list_dfs = []
    for group, df in df_16s_species.groupby('bodysite'):
        bodysite = df["bodysite"]
        df = df.drop("bodysite", axis=1)
        df = df.loc[:, (df > 0).sum(axis=0)/df.shape[0] > .1]
        df = pd.DataFrame(clr(multiplicative_replacement2(df)), index=df.index, columns=df.columns)
        #df["bodysite"] = bodysite
        list_dfs.append(df.copy())
    df_clr_16s = pd.concat(list_dfs)
    df_clr_16s = df_clr_16s.dropna(axis=1, how="all")
    
    df_shotgun_species = df_shotgun_species.T.join(series_hmp_bodysite)
    
    # Shtogun summarize at level
    list_dfs = []
    for group, df in df_shotgun_species.groupby('bodysite'):
        bodysite = df["bodysite"]
        df = df.drop("bodysite", axis=1)
        df = df.loc[:, (df > 0).sum(axis=0)/df.shape[0] > .1]
        df = pd.DataFrame(clr(multiplicative_replacement2(df)), index=df.index, columns=df.columns)
        #df["bodysite"] = bodysite
        list_dfs.append(df.copy())
    df_clr_shotgun = pd.concat(list_dfs)
    df_clr_shotgun = df_clr_shotgun.dropna(axis=1, how="all")
    
    s = 0
    for group, df in df_shotgun_species.groupby('bodysite'):
        for ind1, ind2 in zip(df.index, df.index):
            arr1 = df_clr_shotgun.loc[ind1, :].dropna()
            arr2 = df_clr_16s.loc[ind2, :].dropna()
            s += arr1.corr(arr2)*arr1.corr(arr2)
    yield s
    
    for i in range(num_perms-1):
        s = 0
        for group, df in df_shotgun_species.groupby('bodysite'):
            for ind1, ind2 in zip(df.index, permutation(list(df.index))):
                arr1 = df_clr_shotgun.loc[ind1, :].dropna()
                arr2 = df_clr_16s.loc[ind2, :].dropna()
                s += arr1.corr(arr2)*arr1.corr(arr2)
        yield s

In [15]:
df_hmp_map = pd.read_csv("data/HMP_map.txt", sep='\t', index_col="SRS")

In [16]:
def replace_multi(string, replacement, string_to_replace):
    for s in string_to_replace:
        string = string.replace(s, replacement)
    return string

In [17]:
# Remove duplicates
df_hmp_map['bodysite'] = list(map(lambda x: replace_multi(x, '', ['Left_', 'Right_']), df_hmp_map["HMPBODYSUBSITE"]))
series_hmp_bodysite = df_hmp_map["bodysite"]
series_hmp_bodysite = series_hmp_bodysite[~series_hmp_bodysite.index.duplicated()]

In [18]:
def weighted_jaccard(x, y):
    z = np.vstack((x,y))
    return np.sum(z.min(axis=0)/z.max(axis=0))

In [19]:
# Filter to df shotgun
df_shotgun_clean = cleanup_taxatable(df_shotgun)
df_16s_clean = cleanup_taxatable(df_16s)

# Join up mapping ids
df_shotgun_clean = df_shotgun_clean.T.join(series_hmp_bodysite, how="inner")
df_16s_clean = df_16s_clean.T.join(series_hmp_bodysite, how="inner")

In [20]:
def yield_comparision_stats(df_clr_shotgun, df_clr_16s):
    #('srs', 'pearson', 'spearman', 'num_features_shotgun', 'num_features_16s', 'difference_shotgun', 'difference_16s', 'jaccard', 'w_jaccard')
    for ind in df_clr_shotgun.index:
        arr1 = df_clr_shotgun.loc[ind, :].dropna()
        arr2 = df_clr_16s.loc[ind, :].dropna()
        set_arr1 = set(arr1.index)
        set_arr2 = set(arr2.index)
#         if len(set_arr2.difference(set_arr1)) > 0:
#             print(set_arr2.difference(set_arr1))
        #w_jaccard = weighted_jaccard(arr1.values, arr2[arr1].values)
        yield (ind, arr1.corr(arr2), arr1.corr(arr2, method='spearman'), len(set_arr1), len(set_arr2), len(set_arr1.difference(set_arr2)), len(set_arr2.difference(set_arr1)), len(set_arr1.intersection(set_arr2))/len(set_arr1.union(set_arr2)))

In [21]:
for l in range(2, 8):
    df_comparison = run_experiment(df_16s, df_shotgun, series_hmp_bodysite, l, only_bacteria=True)
    df_comparison.to_csv("results/levels.bacteria.%d.csv" % l)

In [22]:
for l in range(1, 8):
    df_comparison = run_experiment(df_16s, df_shotgun, series_hmp_bodysite, l, only_bacteria=False)
    df_comparison.to_csv("results/levels.%d.csv" % l)


/home/bhillmann/anaconda3/lib/python3.6/site-packages/numpy/lib/function_base.py:2995: RuntimeWarning: Degrees of freedom <= 0 for slice
  c = cov(x, y, rowvar)
/home/bhillmann/anaconda3/lib/python3.6/site-packages/numpy/lib/function_base.py:2929: RuntimeWarning: divide by zero encountered in double_scalars
  c *= 1. / np.float64(fact)

In [23]:
# Run 1,000 iterations of the permuation test
perm_results = np.array([i for i in permutation_test(df_16s, df_shotgun, series_hmp_bodysite, l, num_perms=10, only_bacteria=False)])

In [24]:
with plt.style.context(('seaborn-ticks', 'seaborn-paper')):
    fig, ax = plt.subplots()

sns.distplot(perm_results, ax=ax)
plt.axvline(x=perm_results[0], color='red')
ax.set_ylabel('frequency')
ax.set_xlabel('permuted sum of R-squared')
ax.xaxis.set_major_formatter(FormatStrFormatter('%.0f'))
ax.annotate("p = {:.3f}".format(1-(perm_results < perm_results[0]).sum()/perm_results.shape[0]), xy=(.6, .9), xycoords=ax.transAxes)

sns.despine(fig)

save_plot(fig, 'pairingmatters_hist')


/home/bhillmann/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "

In [25]:
# Filter to df shotgun
df_shotgun_bact = df_shotgun.loc[['k__Bacteria;' in str(_) for _ in df_shotgun.index], :]
df_shotgun_bact = cleanup_taxatable(df_shotgun)
df_16s_clean = cleanup_taxatable(df_16s)

# Join up mapping ids
df_shotgun_bact = df_shotgun_bact.T.join(series_hmp_bodysite, how="inner")
df_16s_clean = df_16s_clean.T.join(series_hmp_bodysite, how="inner")

In [26]:
from collections import defaultdict
shotgun_counter = {}
for group, df in df_shotgun_bact.groupby('bodysite'):
    group_counter = defaultdict(int)
    for c in df.columns:
        if c != 'bodysite':
            group_counter[c.count(';')] += df[c].sum()
    shotgun_counter[group] = group_counter

In [27]:
df_shotgun_levels = pd.DataFrame(shotgun_counter)
df_shotgun_levels = df_shotgun_levels/df_shotgun_levels.sum()

In [28]:
df_shotgun_levels


Out[28]:
Anterior_nares Buccal_mucosa Posterior_fornix Retroauricular_crease Stool Supragingival_plaque Tongue_dorsum
1 0.000751 0.007118 0.002868 0.000215 0.002998 0.001614 0.006157
2 0.001459 0.003413 0.000153 0.001242 0.000844 0.002119 0.004476
3 0.000831 0.007302 0.000796 0.000429 0.051975 0.002121 0.005674
4 0.004011 0.006071 0.000631 0.002011 0.009793 0.004339 0.006658
5 0.025945 0.098925 0.037694 0.026196 0.034289 0.022963 0.049930
6 0.967003 0.877172 0.957858 0.969907 0.900101 0.966843 0.927106

In [29]:
counter_16s = {}
for group, df in df_16s_clean.groupby('bodysite'):
    group_counter = defaultdict(int)
    for c in df.columns:
        if c != 'bodysite':
            group_counter[str(c.count(';'))] += df[c].sum()
    counter_16s[group] = group_counter

In [30]:
df_16s_levels = pd.DataFrame(counter_16s)
df_16s_levels = df_16s_levels/df_16s_levels.sum()

In [31]:
df_16s_levels


Out[31]:
Anterior_nares Buccal_mucosa Posterior_fornix Retroauricular_crease Stool Supragingival_plaque Tongue_dorsum
1 0.000027 0.000000 0.000000 0.000000 0.000034 0.000007 0.000000
2 0.000254 0.001782 0.000000 0.000137 0.000364 0.000322 0.022860
3 0.004707 0.005229 0.000000 0.002302 0.000661 0.000816 0.001975
4 0.000145 0.001453 0.000025 0.000069 0.001285 0.002212 0.000672
5 0.241745 0.215958 0.501423 0.745960 0.032385 0.038234 0.087670
6 0.753121 0.775578 0.498552 0.251532 0.965272 0.958408 0.886822

In [32]:
fig, ax = plt.subplots()
palette = sns.xkcd_palette(['royal blue', 'light red', 'ochre', 'black', 'grey', 'sky blue', 'dark turquoise', 'emerald', 'light purple', 'gold', 'night blue'])

df_16s_levels.index =  ["Phylum", "Class", "Order", "Family", "Genus", "Species"]

# Create stacked barplots
df_16s_levels.T.plot.barh(stacked=True, ax=ax, colormap=ListedColormap(palette), legend=True)

save_plot(fig, '16s_levels_annotated')



In [33]:
fig, ax = plt.subplots()
palette = sns.xkcd_palette(['royal blue', 'light red', 'ochre', 'black', 'grey', 'sky blue', 'dark turquoise', 'emerald', 'light purple', 'gold', 'night blue'])

df_shotgun_levels.index =  ["Phylum", "Class", "Order", "Family", "Genus", "Species"]

# Create stacked barplots
df_shotgun_levels.T.plot.barh(stacked=True, ax=ax, colormap=ListedColormap(palette), legend=False)

save_plot(fig, 'shotgun_levels_annotated')



In [34]:
filter_df_to_taxalevel(df_shotgun_bact.T, 5).sum(axis=1).sort_values(ascending=False).head()
#df_shotgun_species = df_shotgun_species.loc[['k__Bacteria;' in _ for _ in df_shotgun_species.index], :]


Out[34]:
summary_taxa
k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae                    3890423
k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae                       1631089
k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae                    1557696
k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Propionibacteriales;f__Propionibacteriaceae    1235501
k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Micrococcales;f__Micrococcaceae                 590526
dtype: int64

In [35]:
x = 'k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae'
x = 'k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae'
'k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Propionibacteriales;f__Propionibacteriaceae'


Out[35]:
'k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Propionibacteriales;f__Propionibacteriaceae'

In [36]:
set_shotgun = set(df_shotgun_bact.columns)
set_df_16s = set(df_16s_clean.columns)

# for col in set_shotgun.difference(set_df_16s):
#     df_16s_clean[col] = 0

for col in set_df_16s.difference(set_shotgun):
    df_shotgun_bact[col] = 0
    
df_shotgun_bact = df_shotgun_bact[list(set_df_16s)]

In [37]:
palette = sns.xkcd_palette(['royal blue', 'light red', 'ochre', 'black', 'grey', 'sky blue', 'dark turquoise', 'emerald', 'light purple', 'gold', 'night blue', 'green', 'maroon', 'orchid', 'deep pink', 'light grey blue', 'old rose', 'light pink', 'pale aqua', 'yellow', 'lime green', 'dark orange', 'brown'])
color_map = ListedColormap(palette)

In [38]:
fig, ax = plt.subplots()

df = df_shotgun_bact.T.loc[[x in _ for _ in df_shotgun_bact.T.index], :].T.head(25)
df += 1
df = (df.T/df.T.sum()).T
df.plot.barh(stacked=True, legend=False, colormap=color_map, ax=ax)

save_plot(fig, 'shotgun_prevotella')



In [39]:
fig, ax = plt.subplots()

df = df_16s_clean.T.loc[[x in _ for _ in df_16s_clean.T.index], :].T.head(25)
df += 1
df = (df.T/df.T.sum()).T
df.plot.barh(stacked=True, legend=False, colormap=color_map, ax=ax, figsize=(12,12))
#fig1 = custom_legend(fig, color_map, df.columns)
#save_plot(fig, '16s_prevotella', artists=(fig1,))
save_plot(fig, '16s_prevotella')



In [40]:
#df_x = df_16s_clean.T.loc[[x in _ for _ in df_16s_clean.T.index], :].T
df_x = df_16s_clean.T.loc[['Bacteria' in _ for _ in df_16s_clean.T.index], :].T
df_x = df_x.reindex(sorted(df_x.columns), axis=1)
df_x += 1
df_x = (df_x.T/df_x.T.sum()).T
#df_y = df_shotgun_bact.T.loc[[x in _ for _ in df_shotgun_bact.T.index], :].T
df_y = df_shotgun_bact.T.loc[['Bacteria' in _ for _ in df_shotgun_bact.T.index], :].T
df_y = df_y.reindex(sorted(df_y.columns), axis=1)
df_y += 1
df_y = (df_y.T/df_y.T.sum()).T


fig, ax = plt.subplots()

for i in df_x.index:
    if i in df_x.index and i in df_y.index:
        plt.plot(df_x.loc[i, :], df_y.loc[i, :], 'r+')



In [41]:
list_16s_dfs = []
list_shotgun_dfs = []
dict_missing_shotgun = defaultdict(list)
dict_missing_16s = defaultdict(list)
for (group_16, df_16), (group_shotgun, df_shotgun) in zip(df_16s_clean.groupby('bodysite'), df_shotgun_bact.groupby('bodysite')):
    # Run the prevalence filter
    df_16 = df_16.drop("bodysite", axis=1)
    df_16 = df_16.loc[:, (df_16 > 0).sum(axis=0)/df_16.shape[0] > .1]

    # Run the prevlance filter
    df_shotgun = df_shotgun.drop("bodysite", axis=1)
    df_shotgun = df_shotgun.loc[:, (df_shotgun > 0).sum(axis=0)/df_shotgun.shape[0] > .1]
    
    # Reindex both dataframes according to missing columns
    set_shotgun = set(df_shotgun.columns)
    set_df_16s = set(df_16.columns)

    for col in set_df_16s.difference(set_shotgun):
        df_shotgun[col] = 0
        for index in df_shotgun.index:
            dict_missing_shotgun[index].append(col)
    
    for col in set_shotgun.difference(set_df_16s):
        df_16[col] = 0
        for index in df_16.index:
            dict_missing_16s[index].append(col)

    #df_shotgun = df_shotgun[list(set_df_16s)]
    #df_16s = df_16s[list(set_shotgun)]
    
#    df_16 += .001
#    d
    mat_16 = multiplicative_replacement2(df_16, delta=.2)
    df_16 = pd.DataFrame(clr(mat_16), index=df_16.index, columns=df_16.columns)
    list_16s_dfs.append(df_16.copy())
#    df_shotgun += .001
    mat_shotgun = multiplicative_replacement2(df_shotgun, delta=.2)
#    df_shotgun = (df_shotgun.T/df_shotgun.T.sum()).T
    df_shotgun = pd.DataFrame(clr(mat_shotgun), index=df_shotgun.index, columns=df_shotgun.columns)
    list_shotgun_dfs.append(df_shotgun.copy())

df_clr_shotgun = pd.concat(list_shotgun_dfs)
df_clr_shotgun = df_clr_shotgun.dropna(axis=1, how="all")
df_clr_16s = pd.concat(list_16s_dfs)
df_clr_16s = df_clr_16s.dropna(axis=1, how="all")
df_clr_shotgun= df_clr_shotgun.reindex(sorted(df_clr_shotgun.columns), axis=1)
df_clr_16s= df_clr_16s.reindex(sorted(df_clr_16s.columns), axis=1)

In [42]:
df_colors = pd.DataFrame(np.zeros(df_clr_shotgun.shape), index=df_clr_shotgun.index, columns=df_clr_shotgun.columns)

for k, vs in dict_missing_shotgun.items():
    for v in vs:
        df_colors.loc[k, v] = 1
    
for k, vs in dict_missing_16s.items():
    for v in vs:
        df_colors.loc[k, v] = 2

In [43]:
with plt.style.context(('seaborn-ticks', 'seaborn-paper')):
    fig, ax = plt.subplots()
    
palette = sns.color_palette("colorblind", 3)

for i in df_clr_shotgun.index:
    if i in df_clr_shotgun.index and i in df_clr_16s.index:
        ax.scatter(df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 0].dropna(), df_clr_16s.loc[i, :][df_colors.loc[i, :] == 0].dropna(), color=palette[0], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax.scatter(df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 1].dropna(), df_clr_16s.loc[i, :][df_colors.loc[i, :] == 1].dropna(), color=palette[1], marker="o", alpha=.5, zorder=2, edgecolor='face', s=10)
        ax.scatter(df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 2].dropna(), df_clr_16s.loc[i, :][df_colors.loc[i, :] == 2].dropna(), color=palette[2], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)

ax.xaxis.set_ticks(np.arange(0, 1.01, .2))
ax.yaxis.set_ticks(np.arange(0, 1.01, .2))
ax.axis=['equal']

lims = [
    np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
    np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
]

# now plot both limits against eachother
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax.set_aspect('equal')
ax.set_xlim(lims)
ax.set_ylim(lims)

leg1 = custom_legend(ax, palette, ["Both", "16S Only", "Shotgun Only"])
ax.set_ylabel("16S Abundance")
ax.set_xlabel("Shotgun Abundance")

sns.despine(offset=10, trim=True)

save_plot(fig, "scatterplot_abundance", artists=(leg1,))



In [44]:
list_16s_dfs = []
list_shotgun_dfs = []
dict_missing_shotgun = defaultdict(list)
dict_missing_16s = defaultdict(list)
for (group_16, df_16), (group_shotgun, df_shotgun) in zip(df_16s_clean.groupby('bodysite'), df_shotgun_clean.groupby('bodysite')):
    # Run the prevalence filter
    df_16 = df_16.drop("bodysite", axis=1)
    df_16 = df_16.loc[:, (df_16 > 0).sum(axis=0)/df_16.shape[0] > .1]

    # Run the prevlance filter
    df_shotgun = df_shotgun.drop("bodysite", axis=1)
    df_shotgun = df_shotgun.loc[:, (df_shotgun > 0).sum(axis=0)/df_shotgun.shape[0] > .1]
    
    # Reindex both dataframes according to missing columns
    set_shotgun = set(df_shotgun.columns)
    set_df_16s = set(df_16.columns)

    for col in set_df_16s.difference(set_shotgun):
        df_shotgun[col] = 0
        for index in df_shotgun.index:
            dict_missing_shotgun[index].append(col)
    
    for col in set_shotgun.difference(set_df_16s):
        df_16[col] = 0
        for index in df_16.index:
            dict_missing_16s[index].append(col)

    #df_shotgun = df_shotgun[list(set_df_16s)]
    #df_16s = df_16s[list(set_shotgun)]
    
    df_16 += .001
    df_16 = (df_16.T/df_16.T.sum()).T
#     df_16 = pd.DataFrame(clr(multiplicative_replacement2(df_16, delta=.2)), index=df_16.index, columns=df_16.columns)
    list_16s_dfs.append(df_16.copy())
    df_shotgun += .001
    df_shotgun = (df_shotgun.T/df_shotgun.T.sum()).T
#     df_shotgun = pd.DataFrame(clr(multiplicative_replacement2(df_shotgun, delta=.2)), index=df_shotgun.index, columns=df_shotgun.columns)
    list_shotgun_dfs.append(df_shotgun.copy())

df_clr_shotgun = pd.concat(list_shotgun_dfs)
df_clr_shotgun = df_clr_shotgun.dropna(axis=1, how="all")
df_clr_16s = pd.concat(list_16s_dfs)
df_clr_16s = df_clr_16s.dropna(axis=1, how="all")
df_clr_shotgun= df_clr_shotgun.reindex(sorted(df_clr_shotgun.columns), axis=1)
df_clr_16s= df_clr_16s.reindex(sorted(df_clr_16s.columns), axis=1)

In [45]:
df_colors = pd.DataFrame(np.zeros(df_clr_shotgun.shape), index=df_clr_shotgun.index, columns=df_clr_shotgun.columns)

for k, vs in dict_missing_shotgun.items():
    for v in vs:
        df_colors.loc[k, v] = 1
    
for k, vs in dict_missing_16s.items():
    for v in vs:
        if 'k__Bacteria' in v:
            df_colors.loc[k, v] = 2    
        else:
            df_colors.loc[k, v] = 3

In [46]:
# Join up mapping ids
dfgroup_clr_shotgun = df_clr_shotgun.join(series_hmp_bodysite, how="inner").groupby('bodysite').mean()
dfgroup_clr_16s = df_clr_16s.join(series_hmp_bodysite, how="inner").groupby('bodysite').mean()

In [47]:
def yield_summaries(df, df_colors):
    #['shotgun_both','shotgun_shotgun','shotgun_16s_bacteria','shotgun_16s','16s_both','16s_shotgun','16s_16s_bacteria','16s_16s']
    for i in df_clr_shotgun.index:
        if i in df_clr_shotgun.index and i in df_clr_16s.index:
            yield [df.loc[i, :][df_colors.loc[i, :] == 0].dropna().sum(),
             df.loc[i, :][df_colors.loc[i, :] == 1].dropna().sum(),
             df.loc[i, :][df_colors.loc[i, :] == 2].dropna().sum(),
             df.loc[i, :][df_colors.loc[i, :] == 3].dropna().sum()]

In [48]:
df_prop_shotgun = pd.DataFrame(yield_summaries(df_clr_shotgun, df_colors), columns=['both','16s only','shotgun bacteria only','shotgun non-bacteria only'], index=df_clr_shotgun.index)
df_prop_16s = pd.DataFrame(yield_summaries(df_clr_16s, df_colors), columns=['both','16s only','shotgun bacteria only','shotgun non-bacteria only'], index=df_clr_shotgun.index)

In [49]:
df_grouped_shotgun = df_prop_shotgun.join(series_hmp_bodysite, how="inner").groupby('bodysite').mean()
df_grouped_16s = df_prop_16s.join(series_hmp_bodysite, how="inner").groupby('bodysite').mean()

In [51]:
fig, ax = plt.subplots()

palette = sns.xkcd_palette(["windows blue", "amber", "faded green", "dusty purple"])

for i in df_clr_shotgun.index:
    if i in df_clr_shotgun.index and i in df_clr_16s.index:
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 0].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 0].dropna(), color=palette[0], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 1].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 1].dropna(), color=palette[1], marker="o", alpha=.5, zorder=3, edgecolor='face', s=10)
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 2].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 2].dropna(), color=palette[2], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 3].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 3].dropna(), color=palette[3], marker="o", alpha=.5, zorder=2, edgecolor='face', s=10)

ax.xaxis.set_ticks(np.arange(0, 1.01, .2))
ax.yaxis.set_ticks(np.arange(0, 1.01, .2))
ax.axis=['equal']
        
lims = [
    np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
    np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
]

# now plot both limits against eachother
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax.set_aspect('equal')
ax.set_xlim(lims)
ax.set_ylim(lims)


# draw vertical line from (70,100) to (70, 250)
#ax.plot([0, .1], [.1, .1], 'k--', alpha=.75, zorder=4)

# draw diagonal line from (70, 90) to (90, 200)
#ax.plot([.1, .1], [0, .1], 'k--', alpha=.75, zorder=4)


leg1 = custom_legend(ax, palette, ["Both", "16S only", "Shallow shotgun only", "Shallow shotgun non-bacteria"])
ax.set_ylabel("Shallow shotgun abundance")
ax.set_xlabel("16S abundance")

sns.despine(offset=10, trim=True)

save_plot(fig, "scatterplot_abundance")



In [74]:
fig, ax = plt.subplots()

palette = sns.xkcd_palette(["windows blue", "amber", "faded green", "dusty purple"])

for i in df_clr_shotgun.index:
    if i in df_clr_shotgun.index and i in df_clr_16s.index:
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 0].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 0].dropna(), color=palette[0], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 1].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 1].dropna(), color=palette[1], marker="o", alpha=.5, zorder=3, edgecolor='face', s=10)
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 2].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 2].dropna(), color=palette[2], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 2].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 2].dropna(), color=palette[2], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        
ax.xaxis.set_ticks(np.arange(0, 1.01, .2))
ax.yaxis.set_ticks(np.arange(0, 1.01, .2))
ax.axis=['equal']
        
lims = [
    np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
    np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
]

# now plot both limits against eachother
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax.set_aspect('equal')
ax.set_xlim(lims)
ax.set_ylim(lims)


# draw vertical line from (70,100) to (70, 250)
#ax.plot([0, .1], [.1, .1], 'k--', alpha=.75, zorder=4)

# draw diagonal line from (70, 90) to (90, 200)
#ax.plot([.1, .1], [0, .1], 'k--', alpha=.75, zorder=4)


#leg1 = custom_legend(ax, palette, ["Both", "16S only", "Shallow shotgun only"])
ax.set_ylabel("Shallow shotgun abundance")
ax.set_xlabel("16S abundance")

sns.despine(offset=10, trim=True)

save_plot(fig, "scatterplot_abundance_no_purple", size_inches=(4,4))



In [58]:
fig, ax = plt.subplots()

palette = sns.xkcd_palette(["windows blue", "amber", "faded green", "dusty purple"])

for i in df_clr_shotgun.index:
    if i in df_clr_shotgun.index and i in df_clr_16s.index:
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 0].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 0].dropna(), color=palette[0], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 1].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 1].dropna(), color=palette[1], marker="o", alpha=.5, zorder=3, edgecolor='face', s=10)
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 2].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 2].dropna(), color=palette[2], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 3].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 3].dropna(), color=palette[3], marker="o", alpha=.5, zorder=2, edgecolor='face', s=10)

ax.xaxis.set_ticks(np.arange(0, .11, .02))
ax.yaxis.set_ticks(np.arange(0, .11, .02))
ax.set_ylim(-0.01, .11)
ax.set_xlim(-0.01, .11)
ax.axis=['equal']
        
lims = [
    np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
    np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
]

# now plot both limits against eachother
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax.set_aspect('equal')
ax.set_xlim(lims)
ax.set_ylim(lims)

# draw vertical line from (70,100) to (70, 250)
ax.plot([0, .1], [.1, .1], 'k--', alpha=.75, zorder=4)

# draw diagonal line from (70, 90) to (90, 200)
ax.plot([.1, .1], [0, .1], 'k--', alpha=.75, zorder=4)

#leg1 = custom_legend(ax, palette, ["Both", "16S only", "Shotgun only", "Shotgun non-bacteria"])
ax.set_ylabel("Shallow shotgun abundance")
ax.set_xlabel("16S abundance")

save_plot(fig, "inset_scatterplot_abundance")



In [59]:
fig, ax = plt.subplots()

color_map = ListedColormap(palette)
df_grouped_shotgun.index = [_.replace("_", " ") for _ in df_grouped_shotgun.index]

ax = df_grouped_shotgun.plot.barh(stacked=True, legend=False, ax=ax, colormap=color_map)
ax.set_xlabel("Mean abundance")
ax.yaxis.tick_right()

stylize_axes(ax)

save_plot(fig, "stackedhbar_abundances")


/home/bhillmann/anaconda3/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [60]:
df_grouped_16s.plot.barh(stacked=True, legend=False)


Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fbb5de51908>

In [61]:
with plt.style.context(('seaborn-ticks', 'seaborn-paper')):
    fig = plt.figure(figsize=(5.5,2))
    ax1 = plt.subplot(121)
    ax2 = plt.subplot(122)

df_grouped_shotgun.index = [_.replace("_", " ") for _ in df_grouped_shotgun.index]

ax1 = df_grouped_shotgun.plot.barh(stacked=True, legend=False, ax=ax1, colormap=color_map)
ax1.set_xlabel("Mean abundance")
ax1.set_title("Shallow shotgun")
#ax1.yaxis.tick_right()

stylize_axes(ax1)

df_grouped_16s.plot.barh(stacked=True, legend=False, colormap=color_map, ax=ax2)
stylize_axes(ax2)
ax2.set_ylabel("")
ax2.set_xlabel("Mean abundance")
ax2.set_title("16S")
ax2.set_yticklabels("")

for p in ax1.patches:
    if p.get_width() >= .08:
        ax1.annotate("%.2f" % p.get_width(), (p.get_x()+p.get_width()/2, p.get_y()), xytext=(0, 4), fontsize=10, textcoords='offset points', horizontalalignment='center')
        
for p in ax2.patches:
    if p.get_width() >= .08:
        ax2.annotate("%.2f" % p.get_width(), (p.get_x()+p.get_width()/2, p.get_y()), xytext=(0, 4), fontsize=10, textcoords='offset points', horizontalalignment='center')
    

leg1 = custom_legend(ax2, palette, ["Both", "16S only", "Shallow shotgun only", "Shallow shotgun non-bacteria"])

save_plot(fig, "stacked_hbar_both_abundance", size_inches=(11,4), artists=(leg1,))


/home/bhillmann/anaconda3/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [71]:
color_map_no_purple = ListedColormap(palette[:3] + [palette[2]])

with plt.style.context(('seaborn-ticks', 'seaborn-paper')):
    fig = plt.figure(figsize=(4,4))
    ax1 = plt.subplot(111)
    
df_grouped_shotgun.index = [_.replace("_", " ") for _ in df_grouped_shotgun.index]

ax1 = df_grouped_shotgun.plot.barh(stacked=True, legend=False, ax=ax1, colormap=color_map)
ax1.set_xlabel("Mean abundance")
ax1.set_title("Shallow shotgun")
#ax1.yaxis.tick_right()

#stylize_axes(ax1)

for p in ax1.patches:
    if p.get_width() >= .08:
        ax1.annotate("%.2f" % p.get_width(), (p.get_x()+p.get_width()/2, p.get_y()), xytext=(0, 4), fontsize=8, textcoords='offset points', horizontalalignment='center')    

sns.despine(offset=10, trim=True)
        
save_plot(fig, "stacked_hbar_both_abundance_no_purple", size_inches=(4, 4))



In [ ]:
with plt.style.context(('seaborn-ticks', 'seaborn-paper')):
    fig = plt.figure(figsize=(5.5,2))
    ax1 = plt.subplot(121)
    ax2 = plt.subplot(122)

df_grouped_shotgun.index = [_.replace("_", " ") for _ in df_grouped_shotgun.index]

ax1 = df_grouped_shotgun.plot.barh(stacked=True, legend=False, ax=ax1, colormap=color_map)
ax1.set_xlabel("Mean abundance")
ax1.set_title("Shallow shotgun")
#ax1.yaxis.tick_right()

stylize_axes(ax1)

df_grouped_16s.plot.barh(stacked=True, legend=False, colormap=color_map, ax=ax2)
stylize_axes(ax2)
ax2.set_ylabel("")
ax2.set_xlabel("Mean abundance")
ax2.set_title("16S")
ax2.set_yticklabels("")

for p in ax1.patches:
    if p.get_width() >= .08:
        ax1.annotate("%.2f" % p.get_width(), (p.get_x()+p.get_width()/2, p.get_y()), xytext=(0, 4), fontsize=10, textcoords='offset points', horizontalalignment='center')
        
for p in ax2.patches:
    if p.get_width() >= .08:
        ax2.annotate("%.2f" % p.get_width(), (p.get_x()+p.get_width()/2, p.get_y()), xytext=(0, 4), fontsize=10, textcoords='offset points', horizontalalignment='center')
    

leg1 = custom_legend(ax2, palette, ["Both", "16S only", "Shallow shotgun only", "Shallow shotgun non-bacteria"])

save_plot(fig, "stacked_hbar_both_abundance", size_inches=(11,4), artists=(leg1,))

In [55]:
fig, ax = plt.subplots()

color_map = ListedColormap(palette)
df_grouped_shotgun.index = [_.replace("_", " ") for _ in df_grouped_shotgun.index]

ax = df_grouped_shotgun.plot.barh(stacked=True, legend=False, ax=ax, colormap=color_map)
ax.set_xlabel("Mean abundance")
ax.yaxis.tick_right()

stylize_axes(ax)

save_plot(fig, "stackedhbar_abundances")


/home/bhillmann/anaconda3/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [56]:
df_grouped_shotgun['datatype'] = 'shotgun'
df_grouped_16s['datatype'] = '16s'

df_grouped_prop = pd.concat((df_grouped_shotgun, df_grouped_16s))

In [57]:
df_grouped_prop


Out[57]:
both 16s only shotgun bacteria only shotgun non-bacteria only datatype
Anterior nares 0.468352 1.812641e-07 0.520749 0.010900 shotgun
Buccal mucosa 0.870817 4.024719e-08 0.126397 0.002786 shotgun
Posterior fornix 0.381709 4.007048e-08 0.618235 0.000056 shotgun
Retroauricular crease 0.200967 7.211483e-08 0.795375 0.003658 shotgun
Stool 0.890525 9.022622e-08 0.109244 0.000231 shotgun
Supragingival plaque 0.912821 2.002341e-08 0.087128 0.000051 shotgun
Tongue dorsum 0.857125 2.004793e-08 0.142757 0.000118 shotgun
Anterior_nares 0.998972 8.296572e-04 0.000170 0.000028 16s
Buccal_mucosa 0.999774 1.056980e-04 0.000114 0.000007 16s
Posterior_fornix 0.999850 4.410212e-05 0.000103 0.000003 16s
Retroauricular_crease 0.999576 3.415784e-04 0.000069 0.000013 16s
Stool 0.998425 1.480189e-03 0.000094 0.000001 16s
Supragingival_plaque 0.999733 1.954522e-04 0.000070 0.000002 16s
Tongue_dorsum 0.999749 1.704212e-04 0.000078 0.000002 16s

In [58]:
df_grouped_prop.plot.barh(stacked=True, legend=True)


Out[58]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f69f31f3710>

In [59]:
grid = sns.FacetGrid(df_grouped_prop, row="datatype", margin_titles=True)
def stacked_bar(**kwargs):
    print(kwargs)
    return df.plot.barh(stacked=True, legend=True)
grid.map(stacked_bar)


{'color': (0.12156862745098039, 0.4666666666666667, 0.7058823529411765)}
{'color': (0.12156862745098039, 0.4666666666666667, 0.7058823529411765)}
Out[59]:
<seaborn.axisgrid.FacetGrid at 0x7f69f30d8e10>

In [60]:
fig = plt.figure(figsize=(12,4))
ax1 = plt.subplot(221)
ax2 = plt.subplot(222)
ax3 = plt.subplot(223)
ax4 = plt.subplot(234)

#ax1
sns.distplot(perm_results, ax=ax1)
ax1.axvline(x=perm_results[0], color='red')
ax1.set_ylabel('frequency')
ax1.set_xlabel('permuted sum of R-squared')
ax1.xaxis.set_major_formatter(FormatStrFormatter('%.0f'))
ax1.annotate("p = {:.3f}".format(1-(perm_results < perm_results[0]).sum()/perm_results.shape[0]), xy=(.6, .9), xycoords=ax1.transAxes)

#ax2
color_map = ListedColormap(palette)
df_grouped_shotgun.index = [_.replace("_", " ") for _ in df_grouped_shotgun.index]

ax2 = df_grouped_shotgun.plot.barh(stacked=True, legend=False, ax=ax2, colormap=color_map)
ax2.set_xlabel("Mean abundance")

#ax3

#ax4


/home/bhillmann/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "
Out[60]:
Text(0.5,0,'Mean abundance')

In [61]:
with plt.style.context(('seaborn-ticks', 'seaborn-paper')):
    fig = plt.figure(figsize=(11, 8))
    ax1 = plt.subplot(121)
    ax2 = plt.subplot(122)

palette = sns.xkcd_palette(["windows blue", "amber", "faded green", "dusty purple"])

for i in df_clr_shotgun.index:
    if i in df_clr_shotgun.index and i in df_clr_16s.index:
        ax2.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 0].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 0].dropna(), color=palette[0], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax2.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 1].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 1].dropna(), color=palette[1], marker="o", alpha=.5, zorder=3, edgecolor='face', s=10)
        ax2.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 2].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 2].dropna(), color=palette[2], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax2.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 3].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 3].dropna(), color=palette[3], marker="o", alpha=.5, zorder=2, edgecolor='face', s=10)

ax2.xaxis.set_ticks(np.arange(0, .11, .02))
ax2.yaxis.set_ticks(np.arange(0, .11, .02))
ax2.set_ylim(-0.01, .11)
ax2.set_xlim(-0.01, .11)
ax2.axis=['equal']
        
lims = [
    np.min([ax2.get_xlim(), ax2.get_ylim()]),  # min of both axes
    np.max([ax2.get_xlim(), ax2.get_ylim()]),  # max of both axes
]

# now plot both limits against eachother
ax2.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax2.set_aspect('equal')
ax2.set_xlim(lims)
ax2.set_ylim(lims)

# draw vertical line from (70,100) to (70, 250)
ax2.plot([0, .1], [.1, .1], 'k--', alpha=.75, zorder=4)

# draw diagonal line from (70, 90) to (90, 200)
ax2.plot([.1, .1], [0, .1], 'k--', alpha=.75, zorder=4)

#leg1 = custom_legend(ax, palette, ["Both", "16S only", "Shotgun only", "Shotgun non-bacteria"])
ax1.set_ylabel("Shallow shotgun abundance")
ax1.set_xlabel("16S abundance")

for i in df_clr_shotgun.index:
    if i in df_clr_shotgun.index and i in df_clr_16s.index:
        ax1.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 0].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 0].dropna(), color=palette[0], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax1.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 1].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 1].dropna(), color=palette[1], marker="o", alpha=.5, zorder=3, edgecolor='face', s=10)
        ax1.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 2].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 2].dropna(), color=palette[2], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax1.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 3].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 3].dropna(), color=palette[3], marker="o", alpha=.5, zorder=2, edgecolor='face', s=10)

ax1.xaxis.set_ticks(np.arange(0, 1.01, .2))
ax1.yaxis.set_ticks(np.arange(0, 1.01, .2))
ax1.axis=['equal']
        
lims = [
    np.min([ax1.get_xlim(), ax1.get_ylim()]),  # min of both axes
    np.max([ax1.get_xlim(), ax1.get_ylim()]),  # max of both axes
]

# now plot both limits against eachother
ax1.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax1.set_aspect('equal')
ax1.set_xlim(lims)
ax1.set_ylim(lims)

# draw vertical line from (70,100) to (70, 250)
ax1.plot([0, .1], [.1, .1], 'k--', alpha=.75, zorder=4)

# draw diagonal line from (70, 90) to (90, 200)
ax1.plot([.1, .1], [0, .1], 'k--', alpha=.75, zorder=4)

from scipy.stats import pearsonr

x = df_clr_16s.loc[df_clr_shotgun.index, :].values.flatten()
x = x[~np.isnan(x)]

y = df_clr_shotgun.values.flatten()
y = y[~np.isnan(y)]

r, p = pearsonr(x, y)
r = r*r

ax1.annotate(r"$r^2 = {:.2f}$ $p = {:.4f}$".format(r,p), xy=(.1, .9), xycoords=ax1.transAxes)
#ax1.annotate(r"$p = {:.4f}$".format(p), xy=(.4, .9), xycoords=ax1.transAxes)
stylize_axes(ax1)
stylize_axes(ax2)

leg1 = custom_legend(ax2, palette, ["Both", "16S only", "Shallow shotgun only", "Shallow shotgun non-bacteria"])
#ax2.set_ylabel("Shallow shotgun abundance")
ax2.set_xlabel("16S abundance")
plt.tight_layout()

sns.despine(offset=10, trim=True)

#fig.text(.5, -0.1, "16S Abundance", ha='center', va='center')
save_plot(fig, "scatterplot_both_abundance", size_inches=(11,8), artists=(leg1,))


/home/bhillmann/anaconda3/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [62]:
from scipy.stats import pearsonr

x = df_clr_16s.loc[df_clr_shotgun.index, :].values.flatten()
x = x[~np.isnan(x)]

y = df_clr_shotgun.values.flatten()
y = y[~np.isnan(y)]

r, p = pearsonr(x, y)
r = r*r

In [63]:
from scipy.stats import linregress
slope, intercept, r_value, p_value, std_err = linregress(x, y)

In [64]:
slope, intercept, r_value, p_value


Out[64]:
(0.49618829112230212, 0.00092021415951870243, 0.53704785635882046, 0.0)

In [65]:
fig = plt.figure(figsize=(12,8))
ax1 = plt.subplot(221)
ax2 = plt.subplot(222)
ax3 = plt.subplot(223)
ax4 = plt.subplot(224)

palette = sns.xkcd_palette(["windows blue", "amber", "faded green", "dusty purple"])

df_grouped_shotgun.index = [_.replace("_", " ") for _ in df_grouped_shotgun.index]

ax1 = df_grouped_shotgun.plot.barh(stacked=True, legend=False, ax=ax1, colormap=color_map)
ax1.set_xlabel("Mean abundance")
ax1.set_title("Shallow shotgun")
#ax1.yaxis.tick_right()

stylize_axes(ax1)

df_grouped_16s.plot.barh(stacked=True, legend=False, colormap=color_map, ax=ax2)
stylize_axes(ax2)
ax2.set_ylabel("")
ax2.set_xlabel("Mean abundance")
ax2.set_title("16S")

plt.setp(ax2.get_yticklabels(), visible=False)


for i in df_clr_shotgun.index:
    if i in df_clr_shotgun.index and i in df_clr_16s.index:
        ax3.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 0].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 0].dropna(), color=palette[0], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax3.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 1].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 1].dropna(), color=palette[1], marker="o", alpha=.5, zorder=3, edgecolor='face', s=10)
        ax3.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 2].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 2].dropna(), color=palette[2], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax3.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 3].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 3].dropna(), color=palette[3], marker="o", alpha=.5, zorder=2, edgecolor='face', s=10)

ax3.xaxis.set_ticks(np.arange(0, .11, .02))
ax3.yaxis.set_ticks(np.arange(0, .11, .02))
ax3.set_ylim(-0.01, .11)
ax3.set_xlim(-0.01, .11)
ax3.axis=['equal']
        
lims = [
    np.min([ax3.get_xlim(), ax3.get_ylim()]),  # min of both axes
    np.max([ax3.get_xlim(), ax3.get_ylim()]),  # max of both axes
]

# now plot both limits against eachother
ax3.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax3.set_xlim(lims)
ax3.set_ylim(lims)

# draw vertical line from (70,100) to (70, 250)
ax3.plot([0, .1], [.1, .1], 'k--', alpha=.75, zorder=4)

# draw diagonal line from (70, 90) to (90, 200)
ax3.plot([.1, .1], [0, .1], 'k--', alpha=.75, zorder=4)

#leg1 = custom_legend(ax, palette, ["Both", "16S only", "Shotgun only", "Shotgun non-bacteria"])
ax3.set_ylabel("Shallow shotgun abundance")
ax3.set_xlabel("16S abundance")

for i in df_clr_shotgun.index:
    if i in df_clr_shotgun.index and i in df_clr_16s.index:
        ax4.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 0].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 0].dropna(), color=palette[0], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax4.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 1].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 1].dropna(), color=palette[1], marker="o", alpha=.5, zorder=3, edgecolor='face', s=10)
        ax4.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 2].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 2].dropna(), color=palette[2], marker="o", alpha=.5, zorder=1, edgecolor='face', s=10)
        ax4.scatter(df_clr_16s.loc[i, :][df_colors.loc[i, :] == 3].dropna(), df_clr_shotgun.loc[i, :][df_colors.loc[i, :] == 3].dropna(), color=palette[3], marker="o", alpha=.5, zorder=2, edgecolor='face', s=10)

ax4.xaxis.set_ticks(np.arange(0, 1.01, .2))
ax4.yaxis.set_ticks(np.arange(0, 1.01, .2))
        
lims = [
    np.min([ax4.get_xlim(), ax4.get_ylim()]),  # min of both axes
    np.max([ax4.get_xlim(), ax4.get_ylim()]),  # max of both axes
]

# now plot both limits against eachother
ax4.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax4.set_aspect('equal')
ax4.set_xlim(lims)
ax4.set_ylim(lims)

# draw vertical line from (70,100) to (70, 250)
ax4.plot([0, .1], [.1, .1], 'k--', alpha=.75, zorder=4)

# draw diagonal line from (70, 90) to (90, 200)
ax4.plot([.1, .1], [0, .1], 'k--', alpha=.75, zorder=4)

stylize_axes(ax3)
stylize_axes(ax4)

leg1 = custom_legend(ax2, palette, ["Both", "16S only", "Shallow shotgun only", "Shallow shotgun non-bacteria"])
#ax2.set_ylabel("Shallow shotgun abundance")
ax4.set_xlabel("16S abundance")
#ax1.set_aspect('equal')
#ax2.set_aspect('equal')
ax3.set_aspect('equal')
ax4.set_aspect('equal')

plt.tight_layout()

#fig.text(.5, -0.1, "16S Abundance", ha='center', va='center')
save_plot(fig, "scatterplot_barplot_both_abundance", size_inches=(12,8), artists=(leg1,))


/home/bhillmann/anaconda3/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [66]:
print(plt.style.available)


['Solarize_Light2', '_classic_test', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn-bright', 'seaborn-colorblind', 'seaborn-dark-palette', 'seaborn-dark', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'seaborn', 'tableau-colorblind10']

In [ ]: