author: lukethompson@gmail.com
date: 19 Sep 2017
language: Python 3.5
conda environment: emp-py3
license: unlicensed

nestedness_binary_heatmaps.ipynb


In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%run ../../code/colors-and-styles/empcolors.py

In [2]:
# seaborn plot settings: both calls change global defaults, so they affect every
# figure created later in this notebook
sns.set_context('talk')  # "talk" context preset (element/font scaling)
sns.set_style('white')  # "white" style preset

In [3]:
# Input file path stems, given WITHOUT extension: the plotting functions below
# append '.csv' to read the table and '.pdf' to write the figure next to it.
# The text after the last '_' of each file name (Animal, Plant, ...) is what the
# plotting functions parse out as the panel name.
paths_phylum_empo2 = [
    '../../data/nestedness/nest_phylum_Animal',
    '../../data/nestedness/nest_phylum_Plant',
    '../../data/nestedness/nest_phylum_Saline',
    '../../data/nestedness/nest_phylum_Non-saline']
    
# Single table covering all samples; kept as a one-element list so it can be
# looped over in the same way as the list above.
paths_phylum_all = ['../../data/nestedness/nest_phylum_allsamples']

In [4]:
def plot_nestedness_big(path, taxlevel, xmax, ymax, legendloc, legendcol,
                        figsize=(500/30, 80/12.7)):
    """Draw a binary nestedness heatmap from ``<path>.csv`` and save it as ``<path>.pdf``.

    Every row of the table becomes one '|' marker at
    (SAMPLE_RANK, OBSERVATION_RANK), colored by the row's ``empo_3`` value.
    Categories are drawn in sorted order; rows whose ``empo_3`` is missing are
    not drawn.

    Relies on ``get_empo_cat_color`` being available in the kernel namespace
    (presumably provided by the ``%run .../empcolors.py`` line in the first
    cell -- confirm if this function is moved elsewhere).

    Parameters
    ----------
    path : str
        Path stem without extension. '.csv' is appended to read the input and
        '.pdf' to write the figure. The text after the last '_' of the file
        name is used as the x-axis label prefix and returned as ``empolevel``.
    taxlevel : str
        Prefix of the y-axis label, e.g. 'Phyla'.
    xmax, ymax : number or 'auto'
        Upper axis limits before padding (1 is added on x, 0.8 on y).
        'auto' uses the maximum of SAMPLE_RANK / OBSERVATION_RANK.
    legendloc, legendcol
        Accepted for call compatibility but currently unused: they only feed
        the legend call, which is disabled (see the note in the body).
    figsize : tuple of float, optional
        Figure size in inches. Defaults to the previously hard-coded
        (500/30, 80/12.7).

    Returns
    -------
    (pandas.DataFrame, str)
        The table that was read and the parsed ``empolevel`` label.
    """
    df = pd.read_csv('%s.csv' % path)
    # df = df[df['OBSERVATION_ID'] != 'unclassified'] # if we want to remove unclassified phylum
    empolevel = path.split('/')[-1].split('_')[-1]
    if xmax == 'auto':
        xmax = df.SAMPLE_RANK.max()
    if ymax == 'auto':
        ymax = df.OBSERVATION_RANK.max()

    # Explicit figure/axes handles instead of the implicit pyplot "current figure".
    fig, ax = plt.subplots(figsize=figsize)

    # groupby yields the categories in sorted order and splits the table in a
    # single pass; it also skips missing empo_3 values, which previously made
    # np.sort() fail when comparing str with float NaN.
    for empo3, rows in df.groupby('empo_3'):
        ax.scatter(rows.SAMPLE_RANK, rows.OBSERVATION_RANK, marker='|',
                   linewidths=2, label=empo3, color=get_empo_cat_color(empo3))

    ax.set_xlabel('%s samples (sorted by richness)' % empolevel, fontsize=24)
    ax.set_ylabel('%s (sorted by prevalence)' % taxlevel, fontsize=24)
    ax.tick_params(axis='both', which='major', labelsize=20)
    ax.axis([0, xmax+1, 0, ymax+0.8])
    # Legend intentionally disabled; this is where legendloc/legendcol would be used:
    # ax.legend(loc=legendloc, ncol=legendcol, markerscale=4, handletextpad=0, fontsize=18)
    fig.patch.set_alpha(0.0)  # fully transparent figure background
    fig.savefig('%s.pdf' % path)
    return df, empolevel

In [5]:
def plot_nestedness(path, taxlevel, xmax, ymax, legendloc, legendcol,
                    figsize=(400/30, 80/12.7)):
    """Draw a binary nestedness heatmap from ``<path>.csv`` and save it as ``<path>.pdf``.

    Draws the same kind of plot as ``plot_nestedness_big`` on a narrower
    canvas. Every row of the table becomes one '|' marker at
    (SAMPLE_RANK, OBSERVATION_RANK), colored by the row's ``empo_3`` value.
    Categories are drawn in sorted order; rows whose ``empo_3`` is missing are
    not drawn.

    Relies on ``get_empo_cat_color`` being available in the kernel namespace
    (presumably provided by the ``%run .../empcolors.py`` line in the first
    cell -- confirm if this function is moved elsewhere).

    Parameters
    ----------
    path : str
        Path stem without extension. '.csv' is appended to read the input and
        '.pdf' to write the figure. The text after the last '_' of the file
        name is used as the x-axis label prefix and returned as ``empolevel``.
    taxlevel : str
        Prefix of the y-axis label, e.g. 'Phyla'.
    xmax, ymax : number or 'auto'
        Upper axis limits before padding (1 is added on x, 0.8 on y).
        'auto' uses the maximum of SAMPLE_RANK / OBSERVATION_RANK.
    legendloc, legendcol
        Accepted for call compatibility but currently unused: they only feed
        the legend call, which is disabled (see the note in the body).
    figsize : tuple of float, optional
        Figure size in inches. Defaults to the previously hard-coded
        (400/30, 80/12.7).

    Returns
    -------
    (pandas.DataFrame, str)
        The table that was read and the parsed ``empolevel`` label.
    """
    df = pd.read_csv('%s.csv' % path)
    # df = df[df['OBSERVATION_ID'] != 'unclassified'] # if we want to remove unclassified phylum
    empolevel = path.split('/')[-1].split('_')[-1]
    if xmax == 'auto':
        xmax = df.SAMPLE_RANK.max()
    if ymax == 'auto':
        ymax = df.OBSERVATION_RANK.max()

    # Explicit figure/axes handles instead of the implicit pyplot "current figure".
    fig, ax = plt.subplots(figsize=figsize)

    # groupby yields the categories in sorted order and splits the table in a
    # single pass; it also skips missing empo_3 values, which previously made
    # np.sort() fail when comparing str with float NaN.
    for empo3, rows in df.groupby('empo_3'):
        ax.scatter(rows.SAMPLE_RANK, rows.OBSERVATION_RANK, marker='|',
                   linewidths=2, label=empo3, color=get_empo_cat_color(empo3))

    ax.set_xlabel('%s samples (sorted by richness)' % empolevel, fontsize=24)
    ax.set_ylabel('%s (sorted by prevalence)' % taxlevel, fontsize=24)
    ax.tick_params(axis='both', which='major', labelsize=20)
    ax.axis([0, xmax+1, 0, ymax+0.8])
    # Legend intentionally disabled; this is where legendloc/legendcol would be used:
    # ax.legend(loc=legendloc, ncol=legendcol, markerscale=4, handletextpad=0, fontsize=18)
    fig.patch.set_alpha(0.0)  # fully transparent figure background
    fig.savefig('%s.pdf' % path)
    return df, empolevel

In [6]:
def top_taxa(df):
    """Return the distinct (OBSERVATION_RANK, OBSERVATION_ID) pairs, highest rank first.

    Only the two named columns are kept; the result is ordered by descending
    OBSERVATION_RANK, de-duplicated, and given a fresh 0..n-1 index.
    """
    rank_and_id = df[['OBSERVATION_RANK', 'OBSERVATION_ID']]
    by_rank_desc = rank_and_id.sort_values(by='OBSERVATION_RANK', ascending=False)
    distinct_pairs = by_rank_desc.drop_duplicates()
    return distinct_pairs.reset_index(drop=True)

In [7]:
# Plot the all-samples table and start the collection of plotted tables,
# keyed by the panel name that the plotting function parses from each path.
panels_dict = {}
for path in paths_phylum_all:
    # 'auto' takes both axis limits from the data; the trailing 3, 4 are the
    # legendloc/legendcol arguments.
    df, empolevel = plot_nestedness_big(path, 'Phyla', 'auto', 'auto', 3, 4)
    panels_dict[empolevel] = df



In [8]:
# Plot one panel per table in paths_phylum_empo2 and add each table to
# panels_dict, which must already exist from the preceding cell.
for path in paths_phylum_empo2:
    # 'auto' takes both axis limits from the data; the trailing 3, 1 are the
    # legendloc/legendcol arguments.
    df, empolevel = plot_nestedness(path, 'Phyla', 'auto', 'auto', 3, 1)
    panels_dict[empolevel] = df



In [9]:
# For each collected panel, in sorted key order, print the 8 OBSERVATION_ID
# values that occur in the most rows of that panel's table, with their row counts.
for key in sorted(panels_dict):
    print('\n' + key)  # blank line, then the panel name as a heading
    print(panels_dict[key]['OBSERVATION_ID'].value_counts().head(8))


Animal
k__Bacteria;p__Proteobacteria     639
k__Bacteria;p__Firmicutes         631
k__Bacteria;p__Actinobacteria     608
k__Bacteria;p__Bacteroidetes      592
k__Bacteria;p__Cyanobacteria      468
unclassified                      446
k__Bacteria;p__Verrucomicrobia    377
k__Bacteria;p__Fusobacteria       321
Name: OBSERVATION_ID, dtype: int64

Non-saline
k__Bacteria;p__Proteobacteria     595
k__Bacteria;p__Bacteroidetes      587
k__Bacteria;p__Actinobacteria     571
unclassified                      565
k__Bacteria;p__Cyanobacteria      561
k__Bacteria;p__Acidobacteria      550
k__Bacteria;p__Verrucomicrobia    548
k__Bacteria;p__Chloroflexi        546
Name: OBSERVATION_ID, dtype: int64

Plant
k__Bacteria;p__Proteobacteria     377
k__Bacteria;p__Cyanobacteria      369
k__Bacteria;p__Actinobacteria     322
k__Bacteria;p__Bacteroidetes      310
k__Bacteria;p__Planctomycetes     286
k__Bacteria;p__Verrucomicrobia    282
unclassified                      278
k__Bacteria;p__Firmicutes         258
Name: OBSERVATION_ID, dtype: int64

Saline
k__Bacteria;p__Proteobacteria     386
k__Bacteria;p__Bacteroidetes      384
k__Bacteria;p__Cyanobacteria      368
k__Bacteria;p__Actinobacteria     360
k__Bacteria;p__Verrucomicrobia    359
unclassified                      352
k__Bacteria;p__Planctomycetes     337
k__Bacteria;p__Firmicutes         298
Name: OBSERVATION_ID, dtype: int64

allsamples
k__Bacteria;p__Proteobacteria     1997
k__Bacteria;p__Bacteroidetes      1873
k__Bacteria;p__Actinobacteria     1861
k__Bacteria;p__Cyanobacteria      1766
k__Bacteria;p__Firmicutes         1730
unclassified                      1641
k__Bacteria;p__Verrucomicrobia    1566
k__Bacteria;p__Planctomycetes     1410
Name: OBSERVATION_ID, dtype: int64

In [ ]: