Single-cell Paper: Tara Heatmap and Histogram

Histograms for Proch and Pelag of all gene clusters and those missing in Tara metagenomes
Heatmaps for Proch and Pelag of Tara count vs cluster size



In [14]:

    
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import re
import math
from sys import argv

Seaborn settings



In [15]:

    
sns.set_style("whitegrid")
sns.set_style("ticks")
sns.set_context("poster")

Function to read tsv of clusters and fix cluster size column



In [16]:

    
def read_clusters_tsv(path):
    df = pd.read_csv(path, sep='\t', header=0, index_col=0)
    df = df.rename(columns={'Unnamed: 1': 'cluster_size'})
    col = [int(re.sub(r'SIZE:([0-9]*)', r'\1', i)) for i in df['cluster_size']]
    df['cluster_size'] = col
    return df

Function to plot histograms of cluster size



In [17]:

    
def plot_hist_tara(df_all_clusters, df_missing_clusters, bin_max, bin_size, max_frac, num_genomes, ymax, fig_path):
    fig, ax = plt.subplots()
    sns.distplot(df_all_clusters.cluster_size, kde=False, color='b', bins=np.arange(0,bin_max+bin_size,bin_size), label='All gene clusters')
    sns.distplot(df_missing_clusters.cluster_size, kde=False, color='r', bins=np.arange(0,bin_max+bin_size,bin_size), label='Missing from Tara metagenomes')
    sns.despine(offset=10)
    xticks = np.array(np.arange(0, max_frac, 0.2) * num_genomes)
    xticklabels = xticks / num_genomes
    plt.xticks(xticks, xticklabels)
    plt.xlim(0, max_frac*num_genomes)
    plt.xlabel('Cluster copy number (per genome)')
    plt.yscale('log')
    plt.ylim(0.5, 1e4)
    yticks = np.array([1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000])
    yticklabels = ['1', '2', '5', '10', '20', '50', '100', '200', '500', '1000', '2000', '5000', '10000']
    plt.yticks(yticks, yticklabels)
    plt.ylabel('Number of clusters')
    plt.legend()
    fig.set_size_inches(12, 8)
    plt.savefig(fig_path)

Function to merge cluster counts



In [18]:

    
def merge_cluster_counts(path):
    # Paths of input files, containing cluster counts
    paths = pd.Series.from_csv(path, header=-1, sep='\t', index_col=None)
    # Data frame containing all samples cluster counts (NaN if missing)
    pieces = []
    for path in paths:
        fullpath = "/Users/luke/singlecell/tara/PROK-139/%s" % path
        counts = pd.DataFrame.from_csv(fullpath, header=-1, sep='\t', index_col=0)
        pieces.append(counts)
    frame = pd.concat(pieces, axis=1)
    headings = paths.tolist()
    frame.columns = headings
    return frame

Function to make two-column dataframe of cluster_size and tara_count



In [19]:

    
def make_df_size_count(df_all, df_counts):
    df_size_count = pd.DataFrame()
    df_size_count['cluster_size'] = df_all.cluster_size
    df_size_count['tara_count'] = df_counts.count(axis=1)
    df_size_count.fillna(0, inplace=True)
    df_size_count.tara_count = df_size_count.tara_count.astype(int)
    return(df_size_count)

Function to make groupby_size_count frame



In [20]:

    
def make_groupby_size_count(df_size_count):
    # matrix style of cluster_size and tara_count
    groupby_size_count = df_size_count.groupby(['cluster_size', 'tara_count']).size().unstack().transpose()
    max_size = df_size_count.cluster_size.max()
    max_count = int(df_size_count.tara_count.max())
    # add empty columns
    for i in range(1, max_size+1):
        if not i in groupby_size_count:
            groupby_size_count[i] = np.nan
    # add empty rows (might not be any)
    for i in range(1, max_count+1):
        if not i in groupby_size_count.index:
            groupby_size_count.loc[i] = np.nan
    groupby_size_count.sort_index(axis=1, inplace=True)
    #groupby_size_count.fillna(0, inplace=True)
    return(groupby_size_count, max_size, max_count)

Function to plot heatmap



In [21]:

    
def plot_heatmap_tara(df, num_genomes, max_size, max_count, max_frac, xinches, yinches, species, fig_path):
    fig, ax = plt.subplots()
    myfig = sns.heatmap(np.log(df.iloc[::-1]), square=False, robust=True, cmap='inferno')
    xticks = np.array(np.arange(0, 5, 0.2) * num_genomes)
    xticklabels = xticks / num_genomes
    ax.set_xticks(xticks)
    ax.set_xticklabels(xticklabels)
    ax.set_xlabel('Cluster copy number (per genome)')
    ylabels = np.array(df.index[0::20].tolist() + [max_count])
    ax.set_yticks(ylabels+0.5)
    ax.set_yticklabels(ylabels)
    ax.set_ylabel('Tara metagenomes cluster found in')
    ax.set_title('Density heatmap of %s gene clusters' % species)
    ax.axis([0, max_frac*num_genomes, 0, max_count+2])
    fig.set_size_inches(xinches, yinches)
    fig.savefig(fig_path)

PROCHLOROCOCCUS



In [22]:

    
# missing clusters
path = '/Users/luke/singlecell/tara/table_missing_proch_all_1e-5.tsv'
num_genomes = 145
fig_full = '/Users/luke/singlecell/tara/hist_missing_proch_all_1e-5_full.pdf'
fig_zoom = '/Users/luke/singlecell/tara/hist_missing_proch_all_1e-5_zoom.pdf'

df_missing_clusters = read_clusters_tsv(path)



In [23]:

    
# all clusters
path = '/Users/luke/singlecell/tara/table_all_proch.tsv'

df_all_clusters = read_clusters_tsv(path)



In [24]:

    
# tara counts by cluster
species = 'proch'
evalue = '1e-5'
path = '/Users/luke/singlecell/tara/paths_%s_%s.list' % (species, evalue)

df_counts = merge_cluster_counts(path)
# col_SRF = [col for col in list(df_counts.columns) if 'SRF' in col]
# df_counts = df_counts[col_SRF]

Histograms of Prochlorococcus gene cluster size among clusters MISSING in all Tara samples



In [26]:

    
# full hist
plot_hist_tara(df_all_clusters, df_missing_clusters, 240, 5.8, 1.6001, num_genomes, 10000, fig_full)



In [27]:

    
# zoom hist
# plot_hist_tara(df_all_clusters, df_missing_clusters, 1.6201, num_genomes, 30, fig_zoom)

Density heatmap of Prochlorococcus gene clusters by cluster size (numer of genomes) and presence/absence in 139 Tara prokaryote metagenomes



In [28]:

    
df_size_count = make_df_size_count(df_all_clusters, df_counts)
groupby_size_count, max_size, max_count = make_groupby_size_count(df_size_count)



In [29]:

    
print groupby_size_count.max().max()



In [30]:

    
# def fourth_root(num):
#     return num**0.25
# def square_root(num):
#     return num**0.5
# groupby_size_count_fourth_root = groupby_size_count_nonan.apply(fourth_root)
# groupby_size_count_square_root = groupby_size_count_nonan.apply(square_root)



In [31]:

    
fig_path = '/Users/luke/singlecell/tara/heatmap_tara_proch.pdf'
plot_heatmap_tara(groupby_size_count, num_genomes, max_size, max_count, 1.1, 12, 8, 'Prochlorococcus', fig_path) # 1.1 was 1.62



In [32]:

    
# jointplot of cluster_size and tara_count
fig = sns.jointplot(x='cluster_size', y='tara_count', data=df_size_count)









    



/Users/luke/anaconda/envs/qiime190/lib/python2.7/site-packages/matplotlib/__init__.py:892: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
/Users/luke/anaconda/envs/qiime190/lib/python2.7/site-packages/numpy/lib/function_base.py:564: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  n = np.zeros(bins, ntype)
/Users/luke/anaconda/envs/qiime190/lib/python2.7/site-packages/numpy/lib/function_base.py:600: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  n += np.bincount(indices, weights=tmp_w, minlength=bins).astype(ntype)

PELAGIBACTER



In [33]:

    
# missing clusters
path = '/Users/luke/singlecell/tara/table_missing_pelag_all_1e-5.tsv'
num_genomes = 47
fig_full = '/Users/luke/singlecell/tara/hist_missing_pelag_all_1e-5_full.pdf'
fig_zoom = '/Users/luke/singlecell/tara/hist_missing_pelag_all_1e-5_zoom.pdf'

df_missing_clusters = read_clusters_tsv(path)



In [34]:

    
# all clusters
path = '/Users/luke/singlecell/tara/table_all_pelag.tsv'

df_all_clusters = read_clusters_tsv(path)









    



/Users/luke/anaconda/envs/qiime190/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2825: DtypeWarning: Columns (5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226) have mixed types. Specify dtype option on import or set low_memory=False.
  if self.run_code(code, result):



In [35]:

    
# tara counts by cluster 
species = 'pelag'
evalue = '1e-5'
path = '/Users/luke/singlecell/tara/paths_%s_%s.list' % (species, evalue)

df_counts = merge_cluster_counts(path)
# col_SRF = [col for col in list(df_counts.columns) if 'SRF' in col]
# df_counts = df_counts[col_SRF]



In [36]:

    
df_counts.shape









    Out[36]:





(4475, 139)

Histograms of Pelagibacter gene cluster size among clusters MISSING in all Tara samples



In [37]:

    
# full hist
plot_hist_tara(df_all_clusters, df_missing_clusters, 100, 1.88, 1.6001, num_genomes, 5000, fig_full)



In [38]:

    
# zoom hist
#plot_hist_tara(df_all_clusters, df_missing_clusters, 1.6201, num_genomes, 30, fig_zoom)

Density heatmap of Pelagibacter gene clusters by cluster size (numer of genomes) and presence/absence in 139 Tara prokaryote metagenomes



In [39]:

    
df_size_count = make_df_size_count(df_all_clusters, df_counts)
groupby_size_count, max_size, max_count = make_groupby_size_count(df_size_count)



In [40]:

    
print groupby_size_count.max().max()



In [44]:

    
groupby_size_count.max().max()









    Out[44]:





756.0



In [41]:

    
fig_path = '/Users/luke/singlecell/tara/heatmap_tara_pelag.pdf'
plot_heatmap_tara(groupby_size_count, num_genomes, max_size, max_count, 1.1, 12, 8, 'Pelagibacter', fig_path) # 1.1 was 1.62